From 9d59dc4394ae607bcef6205974f59de48f1094c2 Mon Sep 17 00:00:00 2001
From: Ludovic Henry
Date: Sun, 10 May 2026 18:16:35 +0000
Subject: [PATCH 01/10] Add raw_github_probe and dns_probe textfile scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These cover the two metrics in the debugging plan that no built-in
node_exporter collector exposes:

raw_github_probe.py (5 min timer)
  Downloads a fixed artefact from raw.githubusercontent.com (Fastly,
  customer's actual path) and from speed.cloudflare.com (non-Fastly
  control). curl is shelled out so we get %{remote_ip} reflecting the
  IP libcurl actually connected to. Emits raw_github_probe_seconds
  (labelled by target and remote_ip) plus _bytes_per_second and
  _curl_exit_code (labelled by target only). When the Fastly target
  sags but the Cloudflare control stays flat, the issue is on the
  Scaleway-Fastly path (H9), not Scaleway WAN egress generally (H5).

dns_probe.py (60 s timer)
  Resolves raw.githubusercontent.com via socket.getaddrinfo (same
  resolver libc/curl uses) and emits an info-style series
  runner_dns_resolved_ip{host="...",ip="..."}=1 plus a count. Lets us
  correlate slow windows with cache-region or POP flips (H1).

Both scripts run as the node_exporter user and write atomically
(mkstemp + os.replace) to the textfile collector dir. Neither skips its
rows on failure: raw_github_probe emits curl_exit_code=99 when curl
yields no usable result, and dns_probe emits a count of 0 when
resolution fails. Stdlib only — no extra apt packages needed.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 scripts/scw.py | 232 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)

diff --git a/scripts/scw.py b/scripts/scw.py
index 34f3567..ff78161 100644
--- a/scripts/scw.py
+++ b/scripts/scw.py
@@ -864,6 +864,238 @@ def is_ready(res):
 sudo systemctl daemon-reload
 sudo systemctl enable prometheus-agent
 
+###############################################################################
+## Install probe scripts (textfile collector)
+
+# Two scripts cover the metrics in the debugging plan that no built-in
+# node_exporter collector exposes:
+#   - raw_github_probe.py: end-to-end download timing against
+#     raw.githubusercontent.com plus a non-Fastly comparison target. The
+#     headline metric for the slow-CI investigation.
+#   - dns_probe.py: which Fastly IPs raw.githubusercontent.com is currently
+#     resolving to, so we can correlate slow windows with cache-region flips.
+#
+# Implemented in Python (stdlib only). curl is shelled out for the
+# raw_github probe so we get %{remote_ip} reflecting the IP libcurl
+# actually connected to (matches the customer's traffic path more
+# faithfully than what socket.gethostbyname would tell us).
+
+cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null
+#!/usr/bin/env python3
+# Synthetic probe: download a fixed test artefact from
+# raw.githubusercontent.com and a non-Fastly comparison target,
+# emit timing metrics for the node_exporter textfile collector.
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom")
+
+TARGETS = [
+    # Fastly target: small stable file in the same repo as the customer's
+    # slow downloads (they fetch from usnistgov/ACVP-Server).
+    ("raw.githubusercontent.com",
+     "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"),
+    # Non-Fastly comparison: Cloudflare's well-known speed-test endpoint.
+    # When the Fastly target sags but this stays flat, the issue is on the
+    # Scaleway-Fastly path (H9), not Scaleway WAN egress in general (H5).
+    ("cloudflare-control",
+     "https://speed.cloudflare.com/__down?bytes=1048576"),
+]
+
+
+def probe(target: str, url: str) -> list[str]:
+    # Fetch url once with curl and return the three metric rows for target.
+    # A probe with no usable result (timeout, curl missing, unparsable
+    # output) still returns rows, with curl_exit_code=99, so the series
+    # never silently disappears.
+    time_total, speed, remote_ip, exit_code = "0", "0", "unknown", "99"
+    try:
+        result = subprocess.run(
+            [
+                "curl", "-o", "/dev/null", "-s", "--max-time", "30",
+                # "|" separators: %{remote_ip} is empty when no connection
+                # was made, and whitespace splitting would then shift the
+                # following field into the remote_ip column.
+                "-w", "%{time_total}|%{speed_download}|%{remote_ip}\n",
+                url,
+            ],
+            capture_output=True, text=True, timeout=35,
+        )
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass  # keep the failure defaults set above
+    else:
+        fields = result.stdout.strip().split("|")
+        if len(fields) == 3:
+            time_total = fields[0] or "0"
+            speed = fields[1] or "0"
+            remote_ip = fields[2] or "unknown"
+            # Take the exit status from the process itself rather than
+            # the %{exitcode} write-out variable (curl >= 7.75.0 only).
+            exit_code = str(result.returncode)
+    return [
+        f'raw_github_probe_seconds{{target="{target}",remote_ip="{remote_ip}"}} {time_total}',
+        f'raw_github_probe_bytes_per_second{{target="{target}"}} {speed}',
+        f'raw_github_probe_curl_exit_code{{target="{target}"}} {exit_code}',
+    ]
+
+
+def main() -> None:
+    lines = [
+        "# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.",
+        "# TYPE raw_github_probe_seconds gauge",
+        "# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.",
+        "# TYPE raw_github_probe_bytes_per_second gauge",
+        "# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.",
+        "# TYPE raw_github_probe_curl_exit_code gauge",
+    ]
+    for target, url in TARGETS:
+        lines.extend(probe(target, url))
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".raw_github_probe.")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write("\n".join(lines) + "\n")
+        os.replace(tmp, OUT)
+    except Exception:
+        try:
+            os.unlink(tmp)
+        except FileNotFoundError:
+            pass
+        raise
+
+
+if __name__ == "__main__":
+    main()
+SCRIPT_EOF
+sudo chmod 0755 /usr/local/bin/raw_github_probe.py
+sudo chown root:root /usr/local/bin/raw_github_probe.py
+
+cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null
+#!/usr/bin/env python3
+# DNS resolution snapshot for raw.githubusercontent.com — emit info-style
+# metrics so we can see which Fastly IPs each node sees over time.
+#
+# Resolves via socket.getaddrinfo, which uses the same resolver libc
+# would use, so the IPs we record are the same ones curl/the runner
+# agent would actually connect to.
+import os
+import socket
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom")
+HOST = "raw.githubusercontent.com"
+
+
+def resolve(host: str) -> list[str]:
+    ips: set[str] = set()
+    for family in (socket.AF_INET, socket.AF_INET6):
+        try:
+            for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM):
+                ips.add(info[4][0])
+        except socket.gaierror:
+            pass
+    return sorted(ips)
+
+
+def main() -> None:
+    ips = resolve(HOST)
+    lines = [
+        "# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST.",
+        "# TYPE runner_dns_resolved_ip gauge",
+        "# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.",
+        "# TYPE runner_dns_resolved_ip_count gauge",
+    ]
+    for ip in ips:
+        lines.append(f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1')
+    lines.append(f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}')
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".dns_probe.")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write("\n".join(lines) + "\n")
+        os.replace(tmp, OUT)
+    except Exception:
+        try:
+            os.unlink(tmp)
+        except FileNotFoundError:
+            pass
+        raise
+
+
+if __name__ == "__main__":
+    main()
+SCRIPT_EOF
+sudo chmod 0755 /usr/local/bin/dns_probe.py
+sudo chown root:root /usr/local/bin/dns_probe.py
+
+# Systemd timers running each probe as the node_exporter user.
+cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.service
+[Unit]
+Description=Synthetic probe of raw.githubusercontent.com and a comparison target
+After=network-online.target
+Wants=network-online.target
+[Service]
+Type=oneshot
+User=node_exporter
+Group=node_exporter
+ExecStart=/usr/local/bin/raw_github_probe.py
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+ReadWritePaths=/var/lib/node_exporter
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.timer
+[Unit]
+Description=Run raw-github-probe every 5 minutes
+[Timer]
+OnBootSec=1min
+OnUnitActiveSec=5min
+Unit=raw-github-probe.service
+[Install]
+WantedBy=timers.target
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.service
+[Unit]
+Description=DNS resolution snapshot for raw.githubusercontent.com
+After=network-online.target
+Wants=network-online.target
+[Service]
+Type=oneshot
+User=node_exporter
+Group=node_exporter
+ExecStart=/usr/local/bin/dns_probe.py
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+ReadWritePaths=/var/lib/node_exporter
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.timer
+[Unit]
+Description=Run dns-probe every 60 seconds
+[Timer]
+OnBootSec=30s
+OnUnitActiveSec=60s
+# The default AccuracySec of 1min would let a 60 s timer fire up to 60 s late.
+AccuracySec=1s
+Unit=dns-probe.service
+[Install]
+WantedBy=timers.target
+EOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable raw-github-probe.timer
+sudo systemctl enable dns-probe.timer
+
 ###############################################################################
 ## Install containerd
 
From bb57899694750b598dc543d879caac00db10050f Mon Sep 17 00:00:00 2001
From: Ludovic Henry
Date: Sun, 10 May 2026 18:16:36 +0000
Subject: [PATCH 02/10] Add Grafana dashboard for buckets A and B (v2 schema)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ten panels organized by hypothesis (H10, H9, sentinels), in the Grafana v2alpha dashboard schema
(kind/spec wrappers, AutoGridLayout, elements map keyed by panel-N) used by Scaleway's managed Grafana. H10 (single-queue NIC / CPU0 softirq saturation) - CPU softirq% by CPU (default cpu collector) - NET_RX softirq deliveries/sec (--collector.softirqs) - end0 hard IRQs/sec by CPU (--collector.interrupts) H9 (TCP-stack health, Scaleway-Fastly path proxy) - TCPTimeouts/sec - Retransmit ratio with 0.5% threshold - Lost retransmits & spurious RTOs Sentinels - end0 throughput vs 100 Mbps line - end0 NIC drops & errors (H2 should stay flat) - Conntrack utilisation (H3 should stay << 0.01) - EEE / LPI enter rate (H7 fallback; --collector.ethtool) Datasource UID hardcoded to fflnugavx2h34c (Scaleway Cockpit metrics data source for this project). Layout is AutoGridLayout with three columns; panels flow into four rows. No template variables yet — adding a node selector requires a v2-schema sample. https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 827 +++++++++++++++++++++ 1 file changed, 827 insertions(+) create mode 100644 dashboards/network-health-buckets-a-b.json diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json new file mode 100644 index 0000000..9fbfc90 --- /dev/null +++ b/dashboards/network-health-buckets-a-b.json @@ -0,0 +1,827 @@ +{ + "annotations": [ + { + "kind": "AnnotationQuery", + "spec": { + "builtIn": true, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "query": { + "datasource": { "name": "-- Grafana --" }, + "group": "grafana", + "kind": "DataQuery", + "spec": {}, + "version": "v0" + } + } + } + ], + "cursorSync": "Off", + "editable": true, + "elements": { + "panel-1": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": 
"prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_cpu_seconds_total{mode=\"softirq\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_cpu_seconds_total{mode=\"softirq\"}). H10 primary signal — CPU0 approaching 1.0 while others stay near 0 = saturation. Threshold line at 0.9.", + "id": 1, + "links": [], + "title": "H10: CPU softirq% by CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-2": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + 
"datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softirq_total{type=\"NET_RX\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_softirq_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.", + "id": 2, + "links": [], + "title": "H10: NET_RX softirq deliveries/sec per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-3": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": 
"prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_interrupts_total{type=~\".*end0.*\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} type={{type}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_interrupts_total{type=~\".*end0.*\"}). Single-queue light_dwmac_eth pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.", + "id": 3, + "links": [], + "title": "H10: end0 hard IRQs/sec by CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-4": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { 
"name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPTimeouts[5m])", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_netstat_TcpExt_TCPTimeouts). Each timeout = up to 120 s of RTO backoff. H9 primary.", + "id": 4, + "links": [], + "title": "H9: TCPTimeouts/sec", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-5": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": 
"code", + "expr": "rate(node_netstat_Tcp_RetransSegs[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs[5m]), 1)", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "Healthy < 0.5 %; threshold line at 0.005.", + "id": 5, + "links": [], + "title": "H9: Retransmit ratio (RetransSegs / OutSegs)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.005 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-6": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": 
"rate(node_netstat_TcpExt_TCPLostRetransmit[5m])", + "legendFormat": "{{node}} lost-retransmit", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs[5m])", + "legendFormat": "{{node}} spurious-rto", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPLostRetransmit and TCPSpuriousRTOs. Distinguishes real loss from spurious RTO detection.", + "id": 6, + "links": [], + "title": "H9: Lost retransmits & spurious RTOs", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-7": { 
+ "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6", + "legendFormat": "{{node}} rx", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6", + "legendFormat": "{{node}} tx", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_receive/transmit_bytes_total{device=\"end0\"}) * 8 / 1e6. 
Threshold at 90 Mbps (10 % shy of the 100 Mbps line).", + "id": 7, + "links": [], + "title": "Sentinel: end0 throughput (Mbps) vs 100 Mbps line", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "orange", "value": 90 } + ] + }, + "unit": "Mbits" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-8": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_drop_total{device=\"end0\"}[5m])", + "legendFormat": "{{node}} rx-drop", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" 
}, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_errs_total{device=\"end0\"}[5m])", + "legendFormat": "{{node}} rx-errs", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb reopens H2.", + "id": 8, + "links": [], + "title": "Sentinel: end0 NIC drops & errors (H2)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-9": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": 
"DataQuery", + "spec": { + "editorMode": "code", + "expr": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "node_nf_conntrack_entries / _limit. Should stay << 0.01 on a healthy runner; H3 fully refuted on riscv-runner-20.", + "id": 9, + "links": [], + "title": "Sentinel: Conntrack utilisation (H3)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.5 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-10": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": 
"prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\"}[1m])", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n). LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.", + "id": 10, + "links": [], + "title": "Sentinel: EEE / LPI enter rate (H7 fallback)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + } + }, + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": 
"ElementReference", "name": "panel-1" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + }, + "links": [], + "liveNow": false, + "preferences": { + "layout": { + "kind": "AutoGridLayout", + "spec": { "columnWidthMode": "standard", "items": [], "maxColumnCount": 3, "rowHeightMode": "standard" } + } + }, + "preload": false, + "tags": ["rise", "riscv-runner", "network"], + "timeSettings": { + "autoRefresh": "30s", + "autoRefreshIntervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], + "fiscalYearStartMonth": 0, + "from": "now-1h", + "hideTimepicker": false, + "timezone": "browser", + "to": "now" + }, + "title": "RISE RISC-V runner — network health (Buckets A+B)", + "variables": [] +} From 8ca84fe4f5298e4ae073439c6fe42e67d996209e Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 18:25:24 +0000 Subject: [PATCH 03/10] Add TX drops/errs panel + node selector variable - panel-11 (Sentinel: end0 TX drops & errors) mirrors panel-8 against 
node_network_transmit_{drop,errs}_total. Layout puts panels 7, 8, 11 in the same row so throughput, RX errs, and TX errs sit visually grouped under one NIC sentinel band. - panel-8 renamed to "RX drops & errors" to pair cleanly with the new TX panel. - New `node` query variable (multi, includeAll) using label_values(node) so the dashboard can be filtered per-node. Every panel query now selects on node=~"$node" so the variable actually scopes results. https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 155 ++++++++++++++++++--- 1 file changed, 137 insertions(+), 18 deletions(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index 9fbfc90..6dd8d3d 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -38,7 +38,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_cpu_seconds_total{mode=\"softirq\"}[1m])", + "expr": "rate(node_cpu_seconds_total{mode=\"softirq\",node=~\"$node\"}[1m])", "legendFormat": "{{node}} cpu={{cpu}}", "range": true }, @@ -73,8 +73,8 @@ "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line" } }, - "min": 0, "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -113,7 +113,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_softirq_total{type=\"NET_RX\"}[1m])", + "expr": "rate(node_softirq_total{type=\"NET_RX\",node=~\"$node\"}[1m])", "legendFormat": "{{node}} cpu={{cpu}}", "range": true }, @@ -180,7 +180,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_interrupts_total{type=~\".*end0.*\"}[1m])", + "expr": "rate(node_interrupts_total{type=~\".*end0.*\",node=~\"$node\"}[1m])", "legendFormat": "{{node}} cpu={{cpu}} type={{type}}", "range": true }, @@ -247,7 +247,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": 
"rate(node_netstat_TcpExt_TCPTimeouts[5m])", + "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])", "legendFormat": "{{node}}", "range": true }, @@ -315,7 +315,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_netstat_Tcp_RetransSegs[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs[5m]), 1)", + "expr": "rate(node_netstat_Tcp_RetransSegs{node=~\"$node\"}[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs{node=~\"$node\"}[5m]), 1)", "legendFormat": "{{node}}", "range": true }, @@ -389,7 +389,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit[5m])", + "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit{node=~\"$node\"}[5m])", "legendFormat": "{{node}} lost-retransmit", "range": true }, @@ -408,7 +408,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs[5m])", + "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m])", "legendFormat": "{{node}} spurious-rto", "range": true }, @@ -476,7 +476,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_network_receive_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6", + "expr": "rate(node_network_receive_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6", "legendFormat": "{{node}} rx", "range": true }, @@ -495,7 +495,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_network_transmit_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6", + "expr": "rate(node_network_transmit_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6", "legendFormat": "{{node}} tx", "range": true }, @@ -569,7 +569,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_network_receive_drop_total{device=\"end0\"}[5m])", + "expr": "rate(node_network_receive_drop_total{device=\"end0\",node=~\"$node\"}[5m])", "legendFormat": "{{node}} rx-drop", "range": true }, @@ -588,7 +588,7 @@ "kind": "DataQuery", 
"spec": { "editorMode": "code", - "expr": "rate(node_network_receive_errs_total{device=\"end0\"}[5m])", + "expr": "rate(node_network_receive_errs_total{device=\"end0\",node=~\"$node\"}[5m])", "legendFormat": "{{node}} rx-errs", "range": true }, @@ -602,10 +602,10 @@ "transformations": [] } }, - "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb reopens H2.", + "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb on the receive path reopens H2.", "id": 8, "links": [], - "title": "Sentinel: end0 NIC drops & errors (H2)", + "title": "Sentinel: end0 RX drops & errors (H2)", "vizConfig": { "group": "timeseries", "kind": "VizConfig", @@ -656,7 +656,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit", + "expr": "node_nf_conntrack_entries{node=~\"$node\"} / node_nf_conntrack_entries_limit{node=~\"$node\"}", "legendFormat": "{{node}}", "range": true }, @@ -691,8 +691,8 @@ "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line" } }, - "min": 0, "max": 1, + "min": 0, "thresholds": { "mode": "absolute", "steps": [ @@ -731,7 +731,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\"}[1m])", + "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])", "legendFormat": "{{node}}", "range": true }, @@ -781,6 +781,93 @@ "version": "13.0.1" } } + }, + "panel-11": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_drop_total{device=\"end0\",node=~\"$node\"}[5m])", + 
"legendFormat": "{{node}} tx-drop", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_errs_total{device=\"end0\",node=~\"$node\"}[5m])", + "legendFormat": "{{node}} tx-errs", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_transmit_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb on the transmit path means egress trouble.", + "id": 11, + "links": [], + "title": "Sentinel: end0 TX drops & errors", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } } }, 
"layout": { @@ -796,6 +883,7 @@ { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } } ], @@ -823,5 +911,36 @@ "to": "now" }, "title": "RISE RISC-V runner — network health (Buckets A+B)", - "variables": [] + "variables": [ + { + "kind": "QueryVariable", + "spec": { + "allowCustomValue": false, + "current": { "text": ["$__all"], "value": ["$__all"] }, + "definition": "label_values(node)", + "hide": "dontHide", + "includeAll": true, + "label": "Node", + "multi": true, + "name": "node", + "options": [], + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "qryType": 1, + "query": "label_values(node)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "version": "v0" + }, + "refresh": "onDashboardLoad", + "regex": "", + "regexApplyTo": "value", + "skipUrlSync": false, + "sort": "alphabeticalAsc" + } + } + ] } From afc48e729f39f298ca185b10d9ed9bc0db4e2100 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 18:31:08 +0000 Subject: [PATCH 04/10] Fix node_softirqs_total typo + widen netstat.fields filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reasons panels in the dashboard were empty: 1. Dashboard query typo. The softirq panel queried node_softirq_total (singular) but node_exporter exposes node_softirqs_total (plural). 2. 
node_exporter's --collector.netstat.fields default regex excludes TCPLostRetransmit and TCPSpuriousRTOs (and a number of other TcpExt_* fields). The collector reads them from /proc/net/netstat but drops them before exposing. Setting the filter to ^.*$ exposes the full set; cardinality bump on a runner is a few dozen series per node, negligible. The remaining two empty panels (NET_RX softirq cross-check, EEE/LPI) are expected to start populating once a runner is reprovisioned with this scw.py — verifiable on the node via curl -s 127.0.0.1:9100/metrics | grep -E '^node_(softirqs|interrupts|ethtool)_' https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index 6dd8d3d..0338e44 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -113,7 +113,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_softirq_total{type=\"NET_RX\",node=~\"$node\"}[1m])", + "expr": "rate(node_softirqs_total{type=\"NET_RX\",node=~\"$node\"}[1m])", "legendFormat": "{{node}} cpu={{cpu}}", "range": true }, From f2fe6e4b2d1e0282ad7b7a2c4dc698b6fc653e80 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 19:00:42 +0000 Subject: [PATCH 05/10] Fix EEE/LPI metric name for st_gmac driver The light_dwmac_eth driver names the counter irq_tx_path_in_lpi_mode_n, but st_gmac (the driver on the runner that exported the metrics) names it irq_transmitted_path_in_lpi_mode_n. The H7 sentinel panel was querying the light_dwmac form and getting no data. Use the st_gmac form, which is the form actually exposed on the fleet. 
https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index 0338e44..6e90caa 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -731,7 +731,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])", + "expr": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])", "legendFormat": "{{node}}", "range": true }, From d0154283f4338a5b0bc3cc6688b3341514de00b0 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 19:17:52 +0000 Subject: [PATCH 06/10] Fix softirqs metric name and IRQ label selector Two unrelated query fixes against current node_exporter (1.11.1): 1. node_exporter renamed the softirqs metric to node_softirqs_functions_total. The H10 NET_RX panel was querying the old node_softirqs_total. (Raw data already confirms H10: CPU0 has 1.55M NET_RX softirqs vs ~5K on each other CPU.) 2. The interrupts collector exposes the device name under the `devices` label, not `type` (which is the IRQ number). Filter end0 IRQs via devices=~".*end0.*"; legend now shows cpu, dev, and irq for clarity. 
https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index 6e90caa..b51b217 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -113,7 +113,7 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_softirqs_total{type=\"NET_RX\",node=~\"$node\"}[1m])", + "expr": "rate(node_softirqs_functions_total{type=\"NET_RX\",node=~\"$node\"}[1m])", "legendFormat": "{{node}} cpu={{cpu}}", "range": true }, @@ -180,8 +180,8 @@ "kind": "DataQuery", "spec": { "editorMode": "code", - "expr": "rate(node_interrupts_total{type=~\".*end0.*\",node=~\"$node\"}[1m])", - "legendFormat": "{{node}} cpu={{cpu}} type={{type}}", + "expr": "rate(node_interrupts_total{devices=~\".*end0.*\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} dev={{devices}} irq={{type}}", "range": true }, "version": "v0" From e60a4471e6dadf1f8588833a73dd42d0dcc7d573 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 19:26:59 +0000 Subject: [PATCH 07/10] Reorganise dashboard into per-hypothesis rows; add H1 + H5 panels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Six rows: H1 Bad Fastly POP / IPv6 path 3 new panels (12-14) H5 Scaleway WAN egress contention 2 new panels (15-16) H7 EEE / LPI micro-stalls (fallback) panel 10 H9 TCP-stack health panels 4-6 H10 Single-core CPU0 softirq saturation panels 1-3 Sentinels (H2/H3, refuted) panels 7, 8, 11, 9 H2 (NIC errors), H3 (conntrack), H4 (PMTU), H6 (ASN throttle), H8 (in-host contention) don't get their own rows: H2/H3 are sentinels, H4/H6/H8 have no on-host Prometheus metric (live capture or off-host probe only). 
H1/H5 panels read raw_github_probe_seconds, _bytes_per_second (faceted by remote_ip/target) and runner_dns_resolved_ip; they populate once raw_github_probe.py and dns_probe.py are running on the node, otherwise the panels are empty placeholders waiting for the probe data. Layout uses RowsLayout containing one RowsLayoutRow per hypothesis, each wrapping an AutoGridLayout with maxColumnCount=3 — schema matches the v2 sample exported by Scaleway-managed Grafana. https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 529 ++++++++++++++++++++- 1 file changed, 506 insertions(+), 23 deletions(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index b51b217..f816c43 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -127,7 +127,7 @@ "transformations": [] } }, - "description": "rate(node_softirq_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.", + "description": "rate(node_softirqs_functions_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.", "id": 2, "links": [], "title": "H10: NET_RX softirq deliveries/sec per CPU", @@ -194,7 +194,7 @@ "transformations": [] } }, - "description": "rate(node_interrupts_total{type=~\".*end0.*\"}). Single-queue light_dwmac_eth pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.", + "description": "rate(node_interrupts_total{devices=~\".*end0.*\"}). Single-queue NIC pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.", "id": 3, "links": [], "title": "H10: end0 hard IRQs/sec by CPU", @@ -745,10 +745,10 @@ "transformations": [] } }, - "description": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n). 
LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.", + "description": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n). LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.", "id": 10, "links": [], - "title": "Sentinel: EEE / LPI enter rate (H7 fallback)", + "title": "H7: EEE / LPI enter rate (fallback)", "vizConfig": { "group": "timeseries", "kind": "VizConfig", @@ -868,27 +868,510 @@ "version": "13.0.1" } } + }, + "panel-12": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{remote_ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_seconds for the Fastly target, faceted by remote_ip. 
A specific Fastly IP cluster being slow while others are fast = H1 (POP steering).", + "id": 12, + "links": [], + "title": "H1: raw.githubusercontent.com — download time by Fastly IP", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "s" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-13": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{remote_ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": 
"raw_github_probe_bytes_per_second for the Fastly target, faceted by remote_ip.", + "id": 13, + "links": [], + "title": "H1: raw.githubusercontent.com — throughput by Fastly IP", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-14": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "runner_dns_resolved_ip{host=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "runner_dns_resolved_ip — info metric 
(1 per resolved IP). Shows which Fastly IPs each node sees over time. Cache-region flips show up as the set of IP lines changing.", + "id": 14, + "links": [], + "title": "H1: DNS resolutions for raw.githubusercontent.com", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "max": 1, + "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": 0 } ] }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-15": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} fastly", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, 
+ "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"cloudflare-control\",node=~\"$node\"}", + "legendFormat": "{{node}} cloudflare", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_seconds for both targets. When Fastly sags but Cloudflare stays flat → H9 (path-specific). When both sag together → H5 (Scaleway WAN egress).", + "id": 15, + "links": [], + "title": "H5: Probe download time — Fastly vs Cloudflare", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "s" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-16": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + 
"spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} fastly", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"cloudflare-control\",node=~\"$node\"}", + "legendFormat": "{{node}} cloudflare", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_bytes_per_second for both targets. Same correlation logic as the time panel.", + "id": 16, + "links": [], + "title": "H5: Probe throughput — Fastly vs Cloudflare", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, 
"multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } } }, "layout": { - "kind": "AutoGridLayout", + "kind": "RowsLayout", "spec": { - "columnWidthMode": "standard", - "items": [ - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } } - ], - "maxColumnCount": 3, - "rowHeightMode": "standard" + "rows": [ + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H1 — Bad Fastly POP / IPv6 path", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-12" } } }, + { "kind": 
"AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-13" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-14" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H5 — Scaleway WAN egress contention", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-15" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-16" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H7 — EEE / LPI micro-stalls (fallback)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H9 — TCP-stack health (Scaleway↔Fastly path)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H10 — Single-core CPU0 softirq saturation", + "layout": { + "kind": "AutoGridLayout", + "spec": { + 
"columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "Sentinels (H2 / H3 refuted, monitoring)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + } + ] } }, "links": [], @@ -910,13 +1393,13 @@ "timezone": "browser", "to": "now" }, - "title": "RISE RISC-V runner — network health (Buckets A+B)", + "title": "RISE RISC-V runner — network health", "variables": [ { "kind": "QueryVariable", "spec": { "allowCustomValue": false, - "current": { "text": ["$__all"], "value": ["$__all"] }, + "current": { "text": "All", "value": ["$__all"] }, "definition": "label_values(node)", "hide": "dontHide", "includeAll": true, From f3dcae6d10be5a7d06afedfc70eaa42faefb8349 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 19:37:09 +0000 Subject: [PATCH 08/10] Add NET_TX softirq panel to H10 row Mirrors the existing NET_RX softirq panel against node_softirqs_functions_total{type="NET_TX"}. 
Slots into the H10 row between NET_RX softirq and end0 hard IRQs so RX/TX softirq pressure sit side-by-side, with the hard-IRQ panel on the next visual row. https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index f816c43..3a0b516 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -1247,6 +1247,73 @@ "version": "13.0.1" } } + }, + "panel-17": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softirqs_functions_total{type=\"NET_TX\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_softirqs_functions_total{type=\"NET_TX\"}). 
TX-side softirq pressure per CPU; pair with the NET_RX panel for the full softirq picture on the link.", + "id": 17, + "links": [], + "title": "H10: NET_TX softirq deliveries/sec per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } } }, "layout": { @@ -1342,6 +1409,7 @@ "items": [ { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } } ], "maxColumnCount": 3, From c8d10ef8856db723556f528add2b7e606e0778d6 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 
20:24:14 +0000 Subject: [PATCH 09/10] Refactor probe scripts to standalone files The probes were embedded as bash heredocs inside SETUP_SCRIPT; move the canonical sources to scripts/probes/raw_github_probe.py and scripts/probes/dns_probe.py and have setup_runner read and substitute them in. Behaviour identical; SETUP_SCRIPT shrinks by ~140 lines. https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- scripts/probes/dns_probe.py | 58 +++++++++++ scripts/probes/raw_github_probe.py | 77 ++++++++++++++ scripts/scw.py | 157 +++-------------------------- 3 files changed, 147 insertions(+), 145 deletions(-) create mode 100644 scripts/probes/dns_probe.py create mode 100644 scripts/probes/raw_github_probe.py diff --git a/scripts/probes/dns_probe.py b/scripts/probes/dns_probe.py new file mode 100644 index 0000000..62d0a5d --- /dev/null +++ b/scripts/probes/dns_probe.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +"""Snapshot which IPs raw.githubusercontent.com currently resolves to, for +the node_exporter textfile collector. + +Resolves via socket.getaddrinfo (the libc resolver curl uses), so the IPs +we record match the customer's real traffic path. Used to correlate slow +windows with Fastly cache-region or POP flips (H1). +""" + +import os +import socket +import tempfile +from pathlib import Path + +OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom") +HOST = "raw.githubusercontent.com" + +HEADER = """\ +# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST. +# TYPE runner_dns_resolved_ip gauge +# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.
+# TYPE runner_dns_resolved_ip_count gauge +""" + + +def resolve(host: str) -> list[str]: + ips: set[str] = set() + for family in (socket.AF_INET, socket.AF_INET6): + try: + ips.update(info[4][0] for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM)) + except socket.gaierror: + pass + return sorted(ips) + + +def write_atomic(path: Path, content: str) -> None: + """Write content via tempfile + os.replace so node_exporter never sees a half-written file.""" + path.parent.mkdir(parents=True, exist_ok=True) + fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." + path.name + ".") + try: + with os.fdopen(fd, "w") as f: + f.write(content) + os.replace(tmp, path) + except BaseException: + Path(tmp).unlink(missing_ok=True) + raise + + +def main() -> None: + ips = resolve(HOST) + body = HEADER + "".join( + f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1\n' for ip in ips + ) + f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}\n' + write_atomic(OUT, body) + + +if __name__ == "__main__": + main() diff --git a/scripts/probes/raw_github_probe.py b/scripts/probes/raw_github_probe.py new file mode 100644 index 0000000..246d2d3 --- /dev/null +++ b/scripts/probes/raw_github_probe.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +"""Probe raw.githubusercontent.com download speed for the node_exporter +textfile collector. + +Hits the Fastly target the customer's CI actually downloads from and a +non-Fastly comparison endpoint. When the Fastly target sags but the +control stays flat → H9 (Scaleway↔Fastly path). When both sag → H5 +(Scaleway WAN egress). + +curl is shelled out so we get %{remote_ip} reflecting the IP libcurl +actually connected to — matches the customer's real traffic path more +faithfully than socket.gethostbyname would. 
+""" + +import os +import subprocess +import tempfile +from pathlib import Path + +OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom") + +TARGETS = [ + ("raw.githubusercontent.com", + "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"), + ("cloudflare-control", + "https://speed.cloudflare.com/__down?bytes=1048576"), +] + +CURL_FORMAT = "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n" + +HEADER = """\ +# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact. +# TYPE raw_github_probe_seconds gauge +# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec. +# TYPE raw_github_probe_bytes_per_second gauge +# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success. +# TYPE raw_github_probe_curl_exit_code gauge +""" + + +def probe(target: str, url: str) -> str: + """Run one curl, return Prom-formatted lines for the result.""" + try: + result = subprocess.run( + ["curl", "-o", "/dev/null", "-s", "--max-time", "30", "-w", CURL_FORMAT, url], + capture_output=True, text=True, timeout=35, + ) + fields = (result.stdout.strip() or "0 0 unknown 99").split() + except (subprocess.TimeoutExpired, FileNotFoundError): + fields = ["0", "0", "unknown", "99"] + seconds, bps, ip, code = (fields + ["0", "0", "unknown", "99"])[:4] + return ( + f'raw_github_probe_seconds{{target="{target}",remote_ip="{ip}"}} {seconds}\n' + f'raw_github_probe_bytes_per_second{{target="{target}"}} {bps}\n' + f'raw_github_probe_curl_exit_code{{target="{target}"}} {code}\n' + ) + + +def write_atomic(path: Path, content: str) -> None: + """Write content via tempfile + os.replace so node_exporter never sees a half-written file.""" + path.parent.mkdir(parents=True, exist_ok=True) + fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." 
+ path.name + ".") + try: + with os.fdopen(fd, "w") as f: + f.write(content) + os.replace(tmp, path) + except BaseException: + Path(tmp).unlink(missing_ok=True) + raise + + +def main() -> None: + write_atomic(OUT, HEADER + "".join(probe(name, url) for name, url in TARGETS)) + + +if __name__ == "__main__": + main() diff --git a/scripts/scw.py b/scripts/scw.py index ff78161..62a1f45 100644 --- a/scripts/scw.py +++ b/scripts/scw.py @@ -867,155 +867,17 @@ def is_ready(res): ############################################################################### ## Install probe scripts (textfile collector) -# Two scripts cover the metrics in the debugging plan that no built-in -# node_exporter collector exposes: -# - raw_github_probe.py: end-to-end download timing against -# raw.githubusercontent.com plus a non-Fastly comparison target. The -# headline metric for the slow-CI investigation. -# - dns_probe.py: which Fastly IPs raw.githubusercontent.com is currently -# resolving to, so we can correlate slow windows with cache-region flips. -# -# Implemented in Python (stdlib only). curl is shelled out for the -# raw_github probe so we get %{remote_ip} reflecting the IP libcurl -# actually connected to (matches the customer's traffic path more -# faithfully than what socket.gethostbyname would tell us). - -cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null -#!/usr/bin/env python3 -# Synthetic probe: download a fixed test artefact from -# raw.githubusercontent.com and a non-Fastly comparison target, -# emit timing metrics for the node_exporter textfile collector. -import os -import subprocess -import tempfile -from pathlib import Path - -OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom") - -TARGETS = [ - # Fastly target: small stable file in the same repo as the customer's - # slow downloads (they fetch from usnistgov/ACVP-Server). 
- ("raw.githubusercontent.com", - "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"), - # Non-Fastly comparison: Cloudflare's well-known speed-test endpoint. - # When the Fastly target sags but this stays flat, the issue is on the - # Scaleway-Fastly path (H9), not Scaleway WAN egress in general (H5). - ("cloudflare-control", - "https://speed.cloudflare.com/__down?bytes=1048576"), -] - - -def probe(target: str, url: str) -> list[str]: - try: - result = subprocess.run( - [ - "curl", "-o", "/dev/null", "-s", "--max-time", "30", - "-w", "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n", - url, - ], - capture_output=True, text=True, timeout=35, - ) - fields = (result.stdout.strip() or "0 0 unknown 99").split() - except (subprocess.TimeoutExpired, FileNotFoundError): - fields = ["0", "0", "unknown", "99"] - fields = (fields + ["0", "0", "unknown", "99"])[:4] - time_total, speed, remote_ip, exit_code = fields - return [ - f'raw_github_probe_seconds{{target="{target}",remote_ip="{remote_ip}"}} {time_total}', - f'raw_github_probe_bytes_per_second{{target="{target}"}} {speed}', - f'raw_github_probe_curl_exit_code{{target="{target}"}} {exit_code}', - ] - - -def main() -> None: - lines = [ - "# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.", - "# TYPE raw_github_probe_seconds gauge", - "# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.", - "# TYPE raw_github_probe_bytes_per_second gauge", - "# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.", - "# TYPE raw_github_probe_curl_exit_code gauge", - ] - for target, url in TARGETS: - lines.extend(probe(target, url)) - - OUT.parent.mkdir(parents=True, exist_ok=True) - fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".raw_github_probe.") - try: - with os.fdopen(fd, "w") as f: - f.write("\n".join(lines) + "\n") - os.replace(tmp, OUT) - except Exception: - try: - os.unlink(tmp) - except FileNotFoundError: - pass - 
raise - +# Sources live in scripts/probes/; substituted in by setup_runner(). -if __name__ == "__main__": - main() -SCRIPT_EOF +cat <<'PROBE_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null +@@RAW_GITHUB_PROBE_PY@@ +PROBE_EOF sudo chmod 0755 /usr/local/bin/raw_github_probe.py sudo chown root:root /usr/local/bin/raw_github_probe.py -cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null -#!/usr/bin/env python3 -# DNS resolution snapshot for raw.githubusercontent.com — emit info-style -# metrics so we can see which Fastly IPs each node sees over time. -# -# Resolves via socket.getaddrinfo, which uses the same resolver libc -# would use, so the IPs we record are the same ones curl/the runner -# agent would actually connect to. -import os -import socket -import tempfile -from pathlib import Path - -OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom") -HOST = "raw.githubusercontent.com" - - -def resolve(host: str) -> list[str]: - ips: set[str] = set() - for family in (socket.AF_INET, socket.AF_INET6): - try: - for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM): - ips.add(info[4][0]) - except socket.gaierror: - pass - return sorted(ips) - - -def main() -> None: - ips = resolve(HOST) - lines = [ - "# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST.", - "# TYPE runner_dns_resolved_ip gauge", - "# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.", - "# TYPE runner_dns_resolved_ip_count gauge", - ] - for ip in ips: - lines.append(f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1') - lines.append(f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}') - - OUT.parent.mkdir(parents=True, exist_ok=True) - fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".dns_probe.") - try: - with os.fdopen(fd, "w") as f: - f.write("\n".join(lines) + "\n") - os.replace(tmp, OUT) - except Exception: - try: - os.unlink(tmp) - except FileNotFoundError: - pass - raise - - -if
__name__ == "__main__": - main() -SCRIPT_EOF +cat <<'PROBE_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null +@@DNS_PROBE_PY@@ +PROBE_EOF sudo chmod 0755 /usr/local/bin/dns_probe.py sudo chown root:root /usr/local/bin/dns_probe.py @@ -1269,8 +1131,13 @@ def create_cockpit_metrics_push_token(name: str) -> Token: def setup_runner(ssh, runner, pn): cockpit_metrics_ds = get_or_create_cockpit_metrics_data_source() cockpit_metrics_token = create_cockpit_metrics_push_token(f"{runner}-metrics-token") + probes_dir = os.path.join(os.path.dirname(__file__), "probes") + raw_github_probe_py = open(os.path.join(probes_dir, "raw_github_probe.py")).read() + dns_probe_py = open(os.path.join(probes_dir, "dns_probe.py")).read() script = SETUP_SCRIPT.replace("@@COCKPIT_METRICS_PUSH_URL@@", cockpit_metrics_ds.url) \ .replace("@@COCKPIT_METRICS_TOKEN@@", cockpit_metrics_token.secret_key) \ + .replace("@@RAW_GITHUB_PROBE_PY@@", raw_github_probe_py) \ + .replace("@@DNS_PROBE_PY@@", dns_probe_py) \ #FIXME(pn): enable private address again # .replace("@@PN_IP@@", pn.ip) # .replace("@@PN_VLAN_ID@@", pn.vlan_id) From 5289a3fc02b9743d3635e82b5277821107052119 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sun, 10 May 2026 22:04:36 +0000 Subject: [PATCH 10/10] Add three diagnostic panels for next TCPTimeouts incident MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit H9: Spurious-RTO ratio (panel 18) TCPSpuriousRTOs / TCPTimeouts. ~1.0 means most RTOs are spurious (network was fine, kernel was impatient — rto_min tuning); ~0 means genuine packet loss. H9: Fast recovery vs RTO (panel 19) TCPSackRecovery + TCPRenoRecovery (cheap, cwnd preserved) vs TCPTimeouts (expensive, cwnd collapsed). When RTO dominates the loss pattern is severe/bursty. H10: softnet drops & NAPI squeezes per CPU (panel 20) rate(node_softnet_dropped_total) and _times_squeezed_total. 
If times_squeezed > 0 on the CPU that handles end0 IRQs, the NAPI poll exhausted its budget and the NIC dropped at the ring buffer before TCP saw it — connects H10 (CPU0 softirq saturation) to H9 (downstream TCP timeouts). https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z --- dashboards/network-health-buckets-a-b.json | 269 ++++++++++++++++++++- 1 file changed, 267 insertions(+), 2 deletions(-) diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json index 3a0b516..9117752 100644 --- a/dashboards/network-health-buckets-a-b.json +++ b/dashboards/network-health-buckets-a-b.json @@ -1314,6 +1314,268 @@ "version": "13.0.1" } } + }, + "panel-18": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m]) / clamp_min(rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m]), 1)", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPSpuriousRTOs / TCPTimeouts. Close to 1.0 = most RTOs were spurious (network was fine, kernel was impatient — RTO-min tuning territory). 
Close to 0 = real loss.", + "id": 18, + "links": [], + "title": "H9: Spurious-RTO ratio (spurious / total RTO)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "max": 1, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-19": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSackRecovery{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} sack-recovery", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + 
"spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPRenoRecovery{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} reno-recovery", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} rto", + "range": true + }, + "version": "v0" + }, + "refId": "C" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPSackRecovery + TCPRenoRecovery (fast recovery, cwnd preserved) vs TCPTimeouts (RTO, cwnd collapsed). Healthy: fast dominates. Degraded: RTO dominates → severe / bursty loss.", + "id": 19, + "links": [], + "title": "H9: Fast recovery vs RTO", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": 
true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-20": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softnet_dropped_total{node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} dropped", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softnet_times_squeezed_total{node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} squeezed", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "node_softnet_dropped_total and _times_squeezed_total per CPU. squeezed > 0 on the CPU handling end0 IRQs = NAPI poll exhausted its budget; the NIC then drops at the ring buffer before TCP sees it. 
Connects H10 (softirq saturation) to H9 (downstream TCP timeouts).", + "id": 20, + "links": [], + "title": "H10: softnet drops & NAPI squeezes per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } } }, "layout": { @@ -1389,7 +1651,9 @@ "items": [ { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } } + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-18" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { 
"kind": "ElementReference", "name": "panel-19" } } } ], "maxColumnCount": 3, "rowHeightMode": "standard" @@ -1410,7 +1674,8 @@ { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" } } }, - { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } } + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-20" } } } ], "maxColumnCount": 3, "rowHeightMode": "standard"