From 9d59dc4394ae607bcef6205974f59de48f1094c2 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 18:16:35 +0000
Subject: [PATCH 01/10] Add raw_github_probe and dns_probe textfile scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These cover the two metrics in the debugging plan that no built-in
node_exporter collector exposes:

  raw_github_probe.py (5 min timer)
    Downloads a fixed artefact from raw.githubusercontent.com (Fastly,
    customer's actual path) and from speed.cloudflare.com (non-Fastly
    control). curl is shelled out so we get %{remote_ip} reflecting the
    IP libcurl actually connected to. Emits raw_github_probe_seconds,
    _bytes_per_second and _curl_exit_code labelled by target and
    remote_ip. When the Fastly target sags but the Cloudflare control
    stays flat, the issue is on the Scaleway-Fastly path (H9), not
    Scaleway WAN egress generally (H5).

  dns_probe.py (60 s timer)
    Resolves raw.githubusercontent.com via socket.getaddrinfo (same
    resolver libc/curl uses) and emits an info-style series
    runner_dns_resolved_ip{ip="..."}=1 plus a count. Lets us correlate
    slow windows with cache-region or POP flips (H1).

Both scripts run as the node_exporter user, write atomically (mkstemp +
os.replace) to the textfile collector dir, and self-recover from probe
failures by emitting curl_exit_code=99 instead of skipping the row.
Stdlib only — no extra apt packages needed.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 scripts/scw.py | 216 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)

diff --git a/scripts/scw.py b/scripts/scw.py
index 34f3567..ff78161 100644
--- a/scripts/scw.py
+++ b/scripts/scw.py
@@ -864,6 +864,222 @@ def is_ready(res):
 sudo systemctl daemon-reload
 sudo systemctl enable prometheus-agent
 
+###############################################################################
+## Install probe scripts (textfile collector)
+
+# Two scripts cover the metrics in the debugging plan that no built-in
+# node_exporter collector exposes:
+#   - raw_github_probe.py: end-to-end download timing against
+#     raw.githubusercontent.com plus a non-Fastly comparison target. The
+#     headline metric for the slow-CI investigation.
+#   - dns_probe.py: which Fastly IPs raw.githubusercontent.com is currently
+#     resolving to, so we can correlate slow windows with cache-region flips.
+#
+# Implemented in Python (stdlib only). curl is shelled out for the
+# raw_github probe so we get %{remote_ip} reflecting the IP libcurl
+# actually connected to (matches the customer's traffic path more
+# faithfully than what socket.gethostbyname would tell us).
+
+cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null
+#!/usr/bin/env python3
+# Synthetic probe: download a fixed test artefact from
+# raw.githubusercontent.com and a non-Fastly comparison target,
+# emit timing metrics for the node_exporter textfile collector.
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom")
+
+TARGETS = [
+    # Fastly target: small stable file in the same repo as the customer's
+    # slow downloads (they fetch from usnistgov/ACVP-Server).
+    ("raw.githubusercontent.com",
+     "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"),
+    # Non-Fastly comparison: Cloudflare's well-known speed-test endpoint.
+    # When the Fastly target sags but this stays flat, the issue is on the
+    # Scaleway-Fastly path (H9), not Scaleway WAN egress in general (H5).
+    ("cloudflare-control",
+     "https://speed.cloudflare.com/__down?bytes=1048576"),
+]
+
+
+def probe(target: str, url: str) -> list[str]:
+    try:
+        result = subprocess.run(
+            [
+                "curl", "-o", "/dev/null", "-s", "--max-time", "30",
+                "-w", "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n",
+                url,
+            ],
+            capture_output=True, text=True, timeout=35,
+        )
+        fields = (result.stdout.strip() or "0 0 unknown 99").split()
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        fields = ["0", "0", "unknown", "99"]
+    fields = (fields + ["0", "0", "unknown", "99"])[:4]
+    time_total, speed, remote_ip, exit_code = fields
+    return [
+        f'raw_github_probe_seconds{{target="{target}",remote_ip="{remote_ip}"}} {time_total}',
+        f'raw_github_probe_bytes_per_second{{target="{target}"}} {speed}',
+        f'raw_github_probe_curl_exit_code{{target="{target}"}} {exit_code}',
+    ]
+
+
+def main() -> None:
+    lines = [
+        "# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.",
+        "# TYPE raw_github_probe_seconds gauge",
+        "# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.",
+        "# TYPE raw_github_probe_bytes_per_second gauge",
+        "# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.",
+        "# TYPE raw_github_probe_curl_exit_code gauge",
+    ]
+    for target, url in TARGETS:
+        lines.extend(probe(target, url))
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".raw_github_probe.")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write("\n".join(lines) + "\n")
+        os.replace(tmp, OUT)
+    except Exception:
+        try:
+            os.unlink(tmp)
+        except FileNotFoundError:
+            pass
+        raise
+
+
+if __name__ == "__main__":
+    main()
+SCRIPT_EOF
+sudo chmod 0755 /usr/local/bin/raw_github_probe.py
+sudo chown root:root /usr/local/bin/raw_github_probe.py
+
+cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null
+#!/usr/bin/env python3
+# DNS resolution snapshot for raw.githubusercontent.com — emit info-style
+# metrics so we can see which Fastly IPs each node sees over time.
+#
+# Resolves via socket.getaddrinfo, which uses the same resolver libc
+# would use, so the IPs we record are the same ones curl/the runner
+# agent would actually connect to.
+import os
+import socket
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom")
+HOST = "raw.githubusercontent.com"
+
+
+def resolve(host: str) -> list[str]:
+    ips: set[str] = set()
+    for family in (socket.AF_INET, socket.AF_INET6):
+        try:
+            for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM):
+                ips.add(info[4][0])
+        except socket.gaierror:
+            pass
+    return sorted(ips)
+
+
+def main() -> None:
+    ips = resolve(HOST)
+    lines = [
+        "# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST.",
+        "# TYPE runner_dns_resolved_ip gauge",
+        "# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.",
+        "# TYPE runner_dns_resolved_ip_count gauge",
+    ]
+    for ip in ips:
+        lines.append(f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1')
+    lines.append(f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}')
+
+    OUT.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".dns_probe.")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write("\n".join(lines) + "\n")
+        os.replace(tmp, OUT)
+    except Exception:
+        try:
+            os.unlink(tmp)
+        except FileNotFoundError:
+            pass
+        raise
+
+
+if __name__ == "__main__":
+    main()
+SCRIPT_EOF
+sudo chmod 0755 /usr/local/bin/dns_probe.py
+sudo chown root:root /usr/local/bin/dns_probe.py
+
+# Systemd timers running each probe as the node_exporter user.
+cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.service
+[Unit]
+Description=Synthetic probe of raw.githubusercontent.com and a comparison target
+After=network-online.target
+Wants=network-online.target
+[Service]
+Type=oneshot
+User=node_exporter
+Group=node_exporter
+ExecStart=/usr/local/bin/raw_github_probe.py
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+ReadWritePaths=/var/lib/node_exporter
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.timer
+[Unit]
+Description=Run raw-github-probe every 5 minutes
+[Timer]
+OnBootSec=1min
+OnUnitActiveSec=5min
+Unit=raw-github-probe.service
+[Install]
+WantedBy=timers.target
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.service
+[Unit]
+Description=DNS resolution snapshot for raw.githubusercontent.com
+After=network-online.target
+Wants=network-online.target
+[Service]
+Type=oneshot
+User=node_exporter
+Group=node_exporter
+ExecStart=/usr/local/bin/dns_probe.py
+NoNewPrivileges=true
+ProtectSystem=strict
+ProtectHome=true
+PrivateTmp=true
+ReadWritePaths=/var/lib/node_exporter
+EOF
+
+cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.timer
+[Unit]
+Description=Run dns-probe every 60 seconds
+[Timer]
+OnBootSec=30s
+OnUnitActiveSec=60s
+Unit=dns-probe.service
+[Install]
+WantedBy=timers.target
+EOF
+
+sudo systemctl daemon-reload
+sudo systemctl enable raw-github-probe.timer
+sudo systemctl enable dns-probe.timer
+
 ###############################################################################
 ## Install containerd
 

From bb57899694750b598dc543d879caac00db10050f Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 18:16:36 +0000
Subject: [PATCH 02/10] Add Grafana dashboard for buckets A and B (v2 schema)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ten panels organized by hypothesis (H10, H9, sentinels), in the
Grafana v2alpha dashboard schema (kind/spec wrappers, AutoGridLayout,
elements map keyed by panel-N) used by Scaleway's managed Grafana.

  H10 (single-queue NIC / CPU0 softirq saturation)
    - CPU softirq% by CPU                 (default cpu collector)
    - NET_RX softirq deliveries/sec       (--collector.softirqs)
    - end0 hard IRQs/sec by CPU           (--collector.interrupts)

  H9 (TCP-stack health, Scaleway-Fastly path proxy)
    - TCPTimeouts/sec
    - Retransmit ratio with 0.5% threshold
    - Lost retransmits & spurious RTOs

  Sentinels
    - end0 throughput vs 100 Mbps line
    - end0 NIC drops & errors             (H2 should stay flat)
    - Conntrack utilisation               (H3 should stay << 0.01)
    - EEE / LPI enter rate                (H7 fallback; --collector.ethtool)

Datasource UID hardcoded to fflnugavx2h34c (Scaleway Cockpit metrics
data source for this project). Layout is AutoGridLayout with three
columns; panels flow into four rows. No template variables yet —
adding a node selector requires a v2-schema sample.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 827 +++++++++++++++++++++
 1 file changed, 827 insertions(+)
 create mode 100644 dashboards/network-health-buckets-a-b.json

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
new file mode 100644
index 0000000..9fbfc90
--- /dev/null
+++ b/dashboards/network-health-buckets-a-b.json
@@ -0,0 +1,827 @@
+{
+  "annotations": [
+    {
+      "kind": "AnnotationQuery",
+      "spec": {
+        "builtIn": true,
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "query": {
+          "datasource": { "name": "-- Grafana --" },
+          "group": "grafana",
+          "kind": "DataQuery",
+          "spec": {},
+          "version": "v0"
+        }
+      }
+    }
+  ],
+  "cursorSync": "Off",
+  "editable": true,
+  "elements": {
+    "panel-1": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_cpu_seconds_total{mode=\"softirq\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_cpu_seconds_total{mode=\"softirq\"}). H10 primary signal — CPU0 approaching 1.0 while others stay near 0 = saturation. Threshold line at 0.9.",
+        "id": 1,
+        "links": [],
+        "title": "H10: CPU softirq% by CPU",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "line" }
+                },
+                "min": 0,
+                "max": 1,
+                "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                    { "color": "green", "value": 0 },
+                    { "color": "red", "value": 0.9 }
+                  ]
+                },
+                "unit": "percentunit"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-2": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_softirq_total{type=\"NET_RX\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_softirq_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.",
+        "id": 2,
+        "links": [],
+        "title": "H10: NET_RX softirq deliveries/sec per CPU",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-3": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_interrupts_total{type=~\".*end0.*\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}} type={{type}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_interrupts_total{type=~\".*end0.*\"}). Single-queue light_dwmac_eth pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.",
+        "id": 3,
+        "links": [],
+        "title": "H10: end0 hard IRQs/sec by CPU",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-4": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPTimeouts[5m])",
+                      "legendFormat": "{{node}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_netstat_TcpExt_TCPTimeouts). Each timeout = up to 120 s of RTO backoff. H9 primary.",
+        "id": 4,
+        "links": [],
+        "title": "H9: TCPTimeouts/sec",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-5": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_Tcp_RetransSegs[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs[5m]), 1)",
+                      "legendFormat": "{{node}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "Healthy < 0.5 %; threshold line at 0.005.",
+        "id": 5,
+        "links": [],
+        "title": "H9: Retransmit ratio (RetransSegs / OutSegs)",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "line" }
+                },
+                "min": 0,
+                "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                    { "color": "green", "value": 0 },
+                    { "color": "red", "value": 0.005 }
+                  ]
+                },
+                "unit": "percentunit"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-6": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit[5m])",
+                      "legendFormat": "{{node}} lost-retransmit",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs[5m])",
+                      "legendFormat": "{{node}} spurious-rto",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "TCPLostRetransmit and TCPSpuriousRTOs. Distinguishes real loss from spurious RTO detection.",
+        "id": 6,
+        "links": [],
+        "title": "H9: Lost retransmits & spurious RTOs",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-7": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_receive_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6",
+                      "legendFormat": "{{node}} rx",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_transmit_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6",
+                      "legendFormat": "{{node}} tx",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_network_receive/transmit_bytes_total{device=\"end0\"}) * 8 / 1e6. Threshold at 90 Mbps (10 % shy of the 100 Mbps line).",
+        "id": 7,
+        "links": [],
+        "title": "Sentinel: end0 throughput (Mbps) vs 100 Mbps line",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "line" }
+                },
+                "min": 0,
+                "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                    { "color": "green", "value": 0 },
+                    { "color": "orange", "value": 90 }
+                  ]
+                },
+                "unit": "Mbits"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-8": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_receive_drop_total{device=\"end0\"}[5m])",
+                      "legendFormat": "{{node}} rx-drop",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_receive_errs_total{device=\"end0\"}[5m])",
+                      "legendFormat": "{{node}} rx-errs",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb reopens H2.",
+        "id": 8,
+        "links": [],
+        "title": "Sentinel: end0 NIC drops & errors (H2)",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-9": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit",
+                      "legendFormat": "{{node}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "node_nf_conntrack_entries / _limit. Should stay << 0.01 on a healthy runner; H3 fully refuted on riscv-runner-20.",
+        "id": 9,
+        "links": [],
+        "title": "Sentinel: Conntrack utilisation (H3)",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "line" }
+                },
+                "min": 0,
+                "max": 1,
+                "thresholds": {
+                  "mode": "absolute",
+                  "steps": [
+                    { "color": "green", "value": 0 },
+                    { "color": "red", "value": 0.5 }
+                  ]
+                },
+                "unit": "percentunit"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-10": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\"}[1m])",
+                      "legendFormat": "{{node}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n). LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.",
+        "id": 10,
+        "links": [],
+        "title": "Sentinel: EEE / LPI enter rate (H7 fallback)",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    }
+  },
+  "layout": {
+    "kind": "AutoGridLayout",
+    "spec": {
+      "columnWidthMode": "standard",
+      "items": [
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } }
+      ],
+      "maxColumnCount": 3,
+      "rowHeightMode": "standard"
+    }
+  },
+  "links": [],
+  "liveNow": false,
+  "preferences": {
+    "layout": {
+      "kind": "AutoGridLayout",
+      "spec": { "columnWidthMode": "standard", "items": [], "maxColumnCount": 3, "rowHeightMode": "standard" }
+    }
+  },
+  "preload": false,
+  "tags": ["rise", "riscv-runner", "network"],
+  "timeSettings": {
+    "autoRefresh": "30s",
+    "autoRefreshIntervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"],
+    "fiscalYearStartMonth": 0,
+    "from": "now-1h",
+    "hideTimepicker": false,
+    "timezone": "browser",
+    "to": "now"
+  },
+  "title": "RISE RISC-V runner — network health (Buckets A+B)",
+  "variables": []
+}

From 8ca84fe4f5298e4ae073439c6fe42e67d996209e Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 18:25:24 +0000
Subject: [PATCH 03/10] Add TX drops/errs panel + node selector variable

- panel-11 (Sentinel: end0 TX drops & errors) mirrors panel-8 against
  node_network_transmit_{drop,errs}_total. Layout puts panels 7, 8, 11
  in the same row so throughput, RX errs, and TX errs sit visually
  grouped under one NIC sentinel band.
- panel-8 renamed to "RX drops & errors" to pair cleanly with the new
  TX panel.
- New `node` query variable (multi, includeAll) using
  label_values(node) so the dashboard can be filtered per-node. Every
  panel query now selects on node=~"$node" so the variable actually
  scopes results.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 155 ++++++++++++++++++---
 1 file changed, 137 insertions(+), 18 deletions(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index 9fbfc90..6dd8d3d 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -38,7 +38,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_cpu_seconds_total{mode=\"softirq\"}[1m])",
+                      "expr": "rate(node_cpu_seconds_total{mode=\"softirq\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}} cpu={{cpu}}",
                       "range": true
                     },
@@ -73,8 +73,8 @@
                   "stacking": { "group": "A", "mode": "none" },
                   "thresholdsStyle": { "mode": "line" }
                 },
-                "min": 0,
                 "max": 1,
+                "min": 0,
                 "thresholds": {
                   "mode": "absolute",
                   "steps": [
@@ -113,7 +113,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_softirq_total{type=\"NET_RX\"}[1m])",
+                      "expr": "rate(node_softirq_total{type=\"NET_RX\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}} cpu={{cpu}}",
                       "range": true
                     },
@@ -180,7 +180,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_interrupts_total{type=~\".*end0.*\"}[1m])",
+                      "expr": "rate(node_interrupts_total{type=~\".*end0.*\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}} cpu={{cpu}} type={{type}}",
                       "range": true
                     },
@@ -247,7 +247,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_netstat_TcpExt_TCPTimeouts[5m])",
+                      "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])",
                       "legendFormat": "{{node}}",
                       "range": true
                     },
@@ -315,7 +315,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_netstat_Tcp_RetransSegs[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs[5m]), 1)",
+                      "expr": "rate(node_netstat_Tcp_RetransSegs{node=~\"$node\"}[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs{node=~\"$node\"}[5m]), 1)",
                       "legendFormat": "{{node}}",
                       "range": true
                     },
@@ -389,7 +389,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit[5m])",
+                      "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit{node=~\"$node\"}[5m])",
                       "legendFormat": "{{node}} lost-retransmit",
                       "range": true
                     },
@@ -408,7 +408,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs[5m])",
+                      "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m])",
                       "legendFormat": "{{node}} spurious-rto",
                       "range": true
                     },
@@ -476,7 +476,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_network_receive_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6",
+                      "expr": "rate(node_network_receive_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6",
                       "legendFormat": "{{node}} rx",
                       "range": true
                     },
@@ -495,7 +495,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_network_transmit_bytes_total{device=\"end0\"}[1m]) * 8 / 1e6",
+                      "expr": "rate(node_network_transmit_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6",
                       "legendFormat": "{{node}} tx",
                       "range": true
                     },
@@ -569,7 +569,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_network_receive_drop_total{device=\"end0\"}[5m])",
+                      "expr": "rate(node_network_receive_drop_total{device=\"end0\",node=~\"$node\"}[5m])",
                       "legendFormat": "{{node}} rx-drop",
                       "range": true
                     },
@@ -588,7 +588,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_network_receive_errs_total{device=\"end0\"}[5m])",
+                      "expr": "rate(node_network_receive_errs_total{device=\"end0\",node=~\"$node\"}[5m])",
                       "legendFormat": "{{node}} rx-errs",
                       "range": true
                     },
@@ -602,10 +602,10 @@
             "transformations": []
           }
         },
-        "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb reopens H2.",
+        "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb on the receive path reopens H2.",
         "id": 8,
         "links": [],
-        "title": "Sentinel: end0 NIC drops & errors (H2)",
+        "title": "Sentinel: end0 RX drops & errors (H2)",
         "vizConfig": {
           "group": "timeseries",
           "kind": "VizConfig",
@@ -656,7 +656,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "node_nf_conntrack_entries / node_nf_conntrack_entries_limit",
+                      "expr": "node_nf_conntrack_entries{node=~\"$node\"} / node_nf_conntrack_entries_limit{node=~\"$node\"}",
                       "legendFormat": "{{node}}",
                       "range": true
                     },
@@ -691,8 +691,8 @@
                   "stacking": { "group": "A", "mode": "none" },
                   "thresholdsStyle": { "mode": "line" }
                 },
-                "min": 0,
                 "max": 1,
+                "min": 0,
                 "thresholds": {
                   "mode": "absolute",
                   "steps": [
@@ -731,7 +731,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\"}[1m])",
+                      "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}}",
                       "range": true
                     },
@@ -781,6 +781,93 @@
           "version": "13.0.1"
         }
       }
+    },
+    "panel-11": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_transmit_drop_total{device=\"end0\",node=~\"$node\"}[5m])",
+                      "legendFormat": "{{node}} tx-drop",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_network_transmit_errs_total{device=\"end0\",node=~\"$node\"}[5m])",
+                      "legendFormat": "{{node}} tx-errs",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_network_transmit_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb on the transmit path means egress trouble.",
+        "id": 11,
+        "links": [],
+        "title": "Sentinel: end0 TX drops & errors",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
     }
   },
   "layout": {
@@ -796,6 +883,7 @@
         { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } },
         { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } },
         { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } },
+        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } },
         { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } },
         { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } }
       ],
@@ -823,5 +911,36 @@
     "to": "now"
   },
   "title": "RISE RISC-V runner — network health (Buckets A+B)",
-  "variables": []
+  "variables": [
+    {
+      "kind": "QueryVariable",
+      "spec": {
+        "allowCustomValue": false,
+        "current": { "text": ["$__all"], "value": ["$__all"] },
+        "definition": "label_values(node)",
+        "hide": "dontHide",
+        "includeAll": true,
+        "label": "Node",
+        "multi": true,
+        "name": "node",
+        "options": [],
+        "query": {
+          "datasource": { "name": "fflnugavx2h34c" },
+          "group": "prometheus",
+          "kind": "DataQuery",
+          "spec": {
+            "qryType": 1,
+            "query": "label_values(node)",
+            "refId": "PrometheusVariableQueryEditor-VariableQuery"
+          },
+          "version": "v0"
+        },
+        "refresh": "onDashboardLoad",
+        "regex": "",
+        "regexApplyTo": "value",
+        "skipUrlSync": false,
+        "sort": "alphabeticalAsc"
+      }
+    }
+  ]
 }

From afc48e729f39f298ca185b10d9ed9bc0db4e2100 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 18:31:08 +0000
Subject: [PATCH 04/10] Fix node_softirqs_total typo + widen netstat.fields
 filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two reasons panels in the dashboard were empty:

1. Dashboard query typo. The softirq panel queried node_softirq_total
   (singular) but node_exporter exposes node_softirqs_total (plural).

2. node_exporter's --collector.netstat.fields default regex excludes
   TCPLostRetransmit and TCPSpuriousRTOs (and a number of other
   TcpExt_* fields). The collector reads them from /proc/net/netstat
   but drops them before exposing. Setting the filter to ^.*$ exposes
   the full set; cardinality bump on a runner is a few dozen series
   per node, negligible.

The remaining two empty panels (NET_RX softirq cross-check, EEE/LPI)
are expected to start populating once a runner is reprovisioned with
this scw.py — verifiable on the node via
  curl -s 127.0.0.1:9100/metrics | grep -E '^node_(softirqs|interrupts|ethtool)_'

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index 6dd8d3d..0338e44 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -113,7 +113,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_softirq_total{type=\"NET_RX\",node=~\"$node\"}[1m])",
+                      "expr": "rate(node_softirqs_total{type=\"NET_RX\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}} cpu={{cpu}}",
                       "range": true
                     },

From f2fe6e4b2d1e0282ad7b7a2c4dc698b6fc653e80 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 19:00:42 +0000
Subject: [PATCH 05/10] Fix EEE/LPI metric name for st_gmac driver

The light_dwmac_eth driver names the counter
irq_tx_path_in_lpi_mode_n, but st_gmac (the driver on the runner that
exported the metrics) names it irq_transmitted_path_in_lpi_mode_n.
The H7 sentinel panel was querying the light_dwmac form and getting
no data. Use the st_gmac form, which is the form actually exposed
on the fleet.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index 0338e44..6e90caa 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -731,7 +731,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])",
+                      "expr": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}}",
                       "range": true
                     },

From d0154283f4338a5b0bc3cc6688b3341514de00b0 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 19:17:52 +0000
Subject: [PATCH 06/10] Fix softirqs metric name and IRQ label selector

Two unrelated query fixes against current node_exporter (1.11.1):

1. node_exporter renamed the softirqs metric to
   node_softirqs_functions_total. The H10 NET_RX panel was querying
   the old node_softirqs_total. (Raw data already confirms H10:
   CPU0 has 1.55M NET_RX softirqs vs ~5K on each other CPU.)

2. The interrupts collector exposes the device name under the
   `devices` label, not `type` (which is the IRQ number). Filter
   end0 IRQs via devices=~".*end0.*"; legend now shows cpu, dev,
   and irq for clarity.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index 6e90caa..b51b217 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -113,7 +113,7 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_softirqs_total{type=\"NET_RX\",node=~\"$node\"}[1m])",
+                      "expr": "rate(node_softirqs_functions_total{type=\"NET_RX\",node=~\"$node\"}[1m])",
                       "legendFormat": "{{node}} cpu={{cpu}}",
                       "range": true
                     },
@@ -180,8 +180,8 @@
                     "kind": "DataQuery",
                     "spec": {
                       "editorMode": "code",
-                      "expr": "rate(node_interrupts_total{type=~\".*end0.*\",node=~\"$node\"}[1m])",
-                      "legendFormat": "{{node}} cpu={{cpu}} type={{type}}",
+                      "expr": "rate(node_interrupts_total{devices=~\".*end0.*\",node=~\"$node\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}} dev={{devices}} irq={{type}}",
                       "range": true
                     },
                     "version": "v0"

From e60a4471e6dadf1f8588833a73dd42d0dcc7d573 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 19:26:59 +0000
Subject: [PATCH 07/10] Reorganise dashboard into per-hypothesis rows; add H1 +
 H5 panels
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Six rows:
  H1  Bad Fastly POP / IPv6 path                  3 new panels (12-14)
  H5  Scaleway WAN egress contention              2 new panels (15-16)
  H7  EEE / LPI micro-stalls (fallback)           panel 10
  H9  TCP-stack health                            panels 4-6
  H10 Single-core CPU0 softirq saturation         panels 1-3
  Sentinels (H2/H3, refuted)                      panels 7, 8, 11, 9

H2 (NIC errors), H3 (conntrack), H4 (PMTU), H6 (ASN throttle), H8
(in-host contention) don't get their own rows: H2/H3 are sentinels,
H4/H6/H8 have no on-host Prometheus metric (live capture or off-host
probe only).

H1/H5 panels read raw_github_probe_seconds, _bytes_per_second
(faceted by remote_ip/target) and runner_dns_resolved_ip; they
populate once raw_github_probe.py and dns_probe.py are running on
the node, otherwise the panels are empty placeholders waiting for
the probe data.

Layout uses RowsLayout containing one RowsLayoutRow per hypothesis,
each wrapping an AutoGridLayout with maxColumnCount=3 — schema
matches the v2 sample exported by Scaleway-managed Grafana.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 529 ++++++++++++++++++++-
 1 file changed, 506 insertions(+), 23 deletions(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index b51b217..f816c43 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -127,7 +127,7 @@
             "transformations": []
           }
         },
-        "description": "rate(node_softirq_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.",
+        "description": "rate(node_softirqs_functions_total{type=\"NET_RX\"}). Cross-check for the CPU softirq% panel. Needs --collector.softirqs.",
         "id": 2,
         "links": [],
         "title": "H10: NET_RX softirq deliveries/sec per CPU",
@@ -194,7 +194,7 @@
             "transformations": []
           }
         },
-        "description": "rate(node_interrupts_total{type=~\".*end0.*\"}). Single-queue light_dwmac_eth pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.",
+        "description": "rate(node_interrupts_total{devices=~\".*end0.*\"}). Single-queue NIC pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.",
         "id": 3,
         "links": [],
         "title": "H10: end0 hard IRQs/sec by CPU",
@@ -745,10 +745,10 @@
             "transformations": []
           }
         },
-        "description": "rate(node_ethtool_irq_tx_path_in_lpi_mode_n). LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.",
+        "description": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n). LPI being entered constantly mostly means link is idle; only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.",
         "id": 10,
         "links": [],
-        "title": "Sentinel: EEE / LPI enter rate (H7 fallback)",
+        "title": "H7: EEE / LPI enter rate (fallback)",
         "vizConfig": {
           "group": "timeseries",
           "kind": "VizConfig",
@@ -868,27 +868,510 @@
           "version": "13.0.1"
         }
       }
+    },
+    "panel-12": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} ip={{remote_ip}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "raw_github_probe_seconds for the Fastly target, faceted by remote_ip. A specific Fastly IP cluster being slow while others are fast = H1 (POP steering).",
+        "id": 12,
+        "links": [],
+        "title": "H1: raw.githubusercontent.com — download time by Fastly IP",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "s"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-13": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} ip={{remote_ip}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "raw_github_probe_bytes_per_second for the Fastly target, faceted by remote_ip.",
+        "id": 13,
+        "links": [],
+        "title": "H1: raw.githubusercontent.com — throughput by Fastly IP",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "Bps"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-14": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "runner_dns_resolved_ip{host=\"raw.githubusercontent.com\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} ip={{ip}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "runner_dns_resolved_ip — info metric (1 per resolved IP). Shows which Fastly IPs each node sees over time. Cache-region flips show up as the set of IP lines changing.",
+        "id": 14,
+        "links": [],
+        "title": "H1: DNS resolutions for raw.githubusercontent.com",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "max": 1,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": 0 } ] },
+                "unit": "short"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-15": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} fastly",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_seconds{target=\"cloudflare-control\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} cloudflare",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "raw_github_probe_seconds for both targets. When Fastly sags but Cloudflare stays flat → H9 (path-specific). When both sag together → H5 (Scaleway WAN egress).",
+        "id": 15,
+        "links": [],
+        "title": "H5: Probe download time — Fastly vs Cloudflare",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "s"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-16": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} fastly",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "raw_github_probe_bytes_per_second{target=\"cloudflare-control\",node=~\"$node\"}",
+                      "legendFormat": "{{node}} cloudflare",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "raw_github_probe_bytes_per_second for both targets. Same correlation logic as the time panel.",
+        "id": 16,
+        "links": [],
+        "title": "H5: Probe throughput — Fastly vs Cloudflare",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "Bps"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
     }
   },
   "layout": {
-    "kind": "AutoGridLayout",
+    "kind": "RowsLayout",
     "spec": {
-      "columnWidthMode": "standard",
-      "items": [
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } },
-        { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } }
-      ],
-      "maxColumnCount": 3,
-      "rowHeightMode": "standard"
+      "rows": [
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "H1 — Bad Fastly POP / IPv6 path",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-12" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-13" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-14" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        },
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "H5 — Scaleway WAN egress contention",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-15" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-16" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        },
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "H7 — EEE / LPI micro-stalls (fallback)",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        },
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "H9 — TCP-stack health (Scaleway↔Fastly path)",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        },
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "H10 — Single-core CPU0 softirq saturation",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        },
+        {
+          "kind": "RowsLayoutRow",
+          "spec": {
+            "collapse": false,
+            "title": "Sentinels (H2 / H3 refuted, monitoring)",
+            "layout": {
+              "kind": "AutoGridLayout",
+              "spec": {
+                "columnWidthMode": "standard",
+                "items": [
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } }
+                ],
+                "maxColumnCount": 3,
+                "rowHeightMode": "standard"
+              }
+            }
+          }
+        }
+      ]
     }
   },
   "links": [],
@@ -910,13 +1393,13 @@
     "timezone": "browser",
     "to": "now"
   },
-  "title": "RISE RISC-V runner — network health (Buckets A+B)",
+  "title": "RISE RISC-V runner — network health",
   "variables": [
     {
       "kind": "QueryVariable",
       "spec": {
         "allowCustomValue": false,
-        "current": { "text": ["$__all"], "value": ["$__all"] },
+        "current": { "text": "All", "value": ["$__all"] },
         "definition": "label_values(node)",
         "hide": "dontHide",
         "includeAll": true,

From f3dcae6d10be5a7d06afedfc70eaa42faefb8349 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 19:37:09 +0000
Subject: [PATCH 08/10] Add NET_TX softirq panel to H10 row

Mirrors the existing NET_RX softirq panel against
node_softirqs_functions_total{type="NET_TX"}. Slots into the H10 row
between NET_RX softirq and end0 hard IRQs so RX/TX softirq pressure
sit side-by-side, with the hard-IRQ panel on the next visual row.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 68 ++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index f816c43..3a0b516 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -1247,6 +1247,73 @@
           "version": "13.0.1"
         }
       }
+    },
+    "panel-17": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_softirqs_functions_total{type=\"NET_TX\",node=~\"$node\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "rate(node_softirqs_functions_total{type=\"NET_TX\"}). TX-side softirq pressure per CPU; pair with the NET_RX panel for the full softirq picture on the link.",
+        "id": 17,
+        "links": [],
+        "title": "H10: NET_TX softirq deliveries/sec per CPU",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
     }
   },
   "layout": {
@@ -1342,6 +1409,7 @@
                 "items": [
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } },
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" } } },
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }
                 ],
                 "maxColumnCount": 3,

From c8d10ef8856db723556f528add2b7e606e0778d6 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 20:24:14 +0000
Subject: [PATCH 09/10] Refactor probe scripts to standalone files

The probes were embedded as bash heredocs inside SETUP_SCRIPT; move the
canonical sources to scripts/probes/raw_github_probe.py and
scripts/probes/dns_probe.py and have run_setup read and substitute
them in. Behaviour identical; SETUP_SCRIPT shrinks by ~140 lines.

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 scripts/probes/dns_probe.py        |  58 +++++++++++
 scripts/probes/raw_github_probe.py |  77 ++++++++++++++
 scripts/scw.py                     | 157 +++--------------------------
 3 files changed, 147 insertions(+), 145 deletions(-)
 create mode 100644 scripts/probes/dns_probe.py
 create mode 100644 scripts/probes/raw_github_probe.py

diff --git a/scripts/probes/dns_probe.py b/scripts/probes/dns_probe.py
new file mode 100644
index 0000000..62d0a5d
--- /dev/null
+++ b/scripts/probes/dns_probe.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""Snapshot which IPs raw.githubusercontent.com currently resolves to, for
+the node_exporter textfile collector.
+
+Resolves via socket.getaddrinfo (the libc resolver curl uses), so the IPs
+we record match the customer's real traffic path. Used to correlate slow
+windows with Fastly cache-region or POP flips (H1).
+"""
+
+import os
+import socket
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom")
+HOST = "raw.githubusercontent.com"
+
+HEADER = """\
+# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST.
+# TYPE runner_dns_resolved_ip gauge
+# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.
+# TYPE runner_dns_resolved_ip_count gauge
+"""
+
+
+def resolve(host: str) -> list[str]:
+    ips: set[str] = set()
+    for family in (socket.AF_INET, socket.AF_INET6):
+        try:
+            ips.update(info[4][0] for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM))
+        except socket.gaierror:
+            pass
+    return sorted(ips)
+
+
+def write_atomic(path: Path, content: str) -> None:
+    """Write content via tempfile + os.replace so node_exporter never sees a half-written file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." + path.name + ".")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        os.replace(tmp, path)
+    except BaseException:
+        Path(tmp).unlink(missing_ok=True)
+        raise
+
+
+def main() -> None:
+    ips = resolve(HOST)
+    body = HEADER + "".join(
+        f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1\n' for ip in ips
+    ) + f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}\n'
+    write_atomic(OUT, body)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/probes/raw_github_probe.py b/scripts/probes/raw_github_probe.py
new file mode 100644
index 0000000..246d2d3
--- /dev/null
+++ b/scripts/probes/raw_github_probe.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Probe raw.githubusercontent.com download speed for the node_exporter
+textfile collector.
+
+Hits the Fastly target the customer's CI actually downloads from and a
+non-Fastly comparison endpoint. When the Fastly target sags but the
+control stays flat → H9 (Scaleway↔Fastly path). When both sag → H5
+(Scaleway WAN egress).
+
+curl is shelled out so we get %{remote_ip} reflecting the IP libcurl
+actually connected to — matches the customer's real traffic path more
+faithfully than socket.gethostbyname would.
+"""
+
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom")
+
+TARGETS = [
+    ("raw.githubusercontent.com",
+     "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"),
+    ("cloudflare-control",
+     "https://speed.cloudflare.com/__down?bytes=1048576"),
+]
+
+CURL_FORMAT = "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n"
+
+HEADER = """\
+# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.
+# TYPE raw_github_probe_seconds gauge
+# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.
+# TYPE raw_github_probe_bytes_per_second gauge
+# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.
+# TYPE raw_github_probe_curl_exit_code gauge
+"""
+
+
+def probe(target: str, url: str) -> str:
+    """Run one curl, return Prom-formatted lines for the result."""
+    try:
+        result = subprocess.run(
+            ["curl", "-o", "/dev/null", "-s", "--max-time", "30", "-w", CURL_FORMAT, url],
+            capture_output=True, text=True, timeout=35,
+        )
+        fields = (result.stdout.strip() or "0 0 unknown 99").split()
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        fields = ["0", "0", "unknown", "99"]
+    seconds, bps, ip, code = (fields + ["0", "0", "unknown", "99"])[:4]
+    return (
+        f'raw_github_probe_seconds{{target="{target}",remote_ip="{ip}"}} {seconds}\n'
+        f'raw_github_probe_bytes_per_second{{target="{target}"}} {bps}\n'
+        f'raw_github_probe_curl_exit_code{{target="{target}"}} {code}\n'
+    )
+
+
+def write_atomic(path: Path, content: str) -> None:
+    """Write content via tempfile + os.replace so node_exporter never sees a half-written file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." + path.name + ".")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        os.replace(tmp, path)
+    except BaseException:
+        Path(tmp).unlink(missing_ok=True)
+        raise
+
+
+def main() -> None:
+    write_atomic(OUT, HEADER + "".join(probe(name, url) for name, url in TARGETS))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/scw.py b/scripts/scw.py
index ff78161..62a1f45 100644
--- a/scripts/scw.py
+++ b/scripts/scw.py
@@ -867,155 +867,17 @@ def is_ready(res):
 ###############################################################################
 ## Install probe scripts (textfile collector)
 
-# Two scripts cover the metrics in the debugging plan that no built-in
-# node_exporter collector exposes:
-#   - raw_github_probe.py: end-to-end download timing against
-#     raw.githubusercontent.com plus a non-Fastly comparison target. The
-#     headline metric for the slow-CI investigation.
-#   - dns_probe.py: which Fastly IPs raw.githubusercontent.com is currently
-#     resolving to, so we can correlate slow windows with cache-region flips.
-#
-# Implemented in Python (stdlib only). curl is shelled out for the
-# raw_github probe so we get %{remote_ip} reflecting the IP libcurl
-# actually connected to (matches the customer's traffic path more
-# faithfully than what socket.gethostbyname would tell us).
-
-cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null
-#!/usr/bin/env python3
-# Synthetic probe: download a fixed test artefact from
-# raw.githubusercontent.com and a non-Fastly comparison target,
-# emit timing metrics for the node_exporter textfile collector.
-import os
-import subprocess
-import tempfile
-from pathlib import Path
-
-OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom")
-
-TARGETS = [
-    # Fastly target: small stable file in the same repo as the customer's
-    # slow downloads (they fetch from usnistgov/ACVP-Server).
-    ("raw.githubusercontent.com",
-     "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"),
-    # Non-Fastly comparison: Cloudflare's well-known speed-test endpoint.
-    # When the Fastly target sags but this stays flat, the issue is on the
-    # Scaleway-Fastly path (H9), not Scaleway WAN egress in general (H5).
-    ("cloudflare-control",
-     "https://speed.cloudflare.com/__down?bytes=1048576"),
-]
-
-
-def probe(target: str, url: str) -> list[str]:
-    try:
-        result = subprocess.run(
-            [
-                "curl", "-o", "/dev/null", "-s", "--max-time", "30",
-                "-w", "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n",
-                url,
-            ],
-            capture_output=True, text=True, timeout=35,
-        )
-        fields = (result.stdout.strip() or "0 0 unknown 99").split()
-    except (subprocess.TimeoutExpired, FileNotFoundError):
-        fields = ["0", "0", "unknown", "99"]
-    fields = (fields + ["0", "0", "unknown", "99"])[:4]
-    time_total, speed, remote_ip, exit_code = fields
-    return [
-        f'raw_github_probe_seconds{{target="{target}",remote_ip="{remote_ip}"}} {time_total}',
-        f'raw_github_probe_bytes_per_second{{target="{target}"}} {speed}',
-        f'raw_github_probe_curl_exit_code{{target="{target}"}} {exit_code}',
-    ]
-
-
-def main() -> None:
-    lines = [
-        "# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.",
-        "# TYPE raw_github_probe_seconds gauge",
-        "# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.",
-        "# TYPE raw_github_probe_bytes_per_second gauge",
-        "# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.",
-        "# TYPE raw_github_probe_curl_exit_code gauge",
-    ]
-    for target, url in TARGETS:
-        lines.extend(probe(target, url))
-
-    OUT.parent.mkdir(parents=True, exist_ok=True)
-    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".raw_github_probe.")
-    try:
-        with os.fdopen(fd, "w") as f:
-            f.write("\n".join(lines) + "\n")
-        os.replace(tmp, OUT)
-    except Exception:
-        try:
-            os.unlink(tmp)
-        except FileNotFoundError:
-            pass
-        raise
-
+# Sources live in scripts/probes/; substituted in by run_setup().
 
-if __name__ == "__main__":
-    main()
-SCRIPT_EOF
+cat <<'PROBE_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null
+@@RAW_GITHUB_PROBE_PY@@
+PROBE_EOF
 sudo chmod 0755 /usr/local/bin/raw_github_probe.py
 sudo chown root:root /usr/local/bin/raw_github_probe.py
 
-cat <<'SCRIPT_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null
-#!/usr/bin/env python3
-# DNS resolution snapshot for raw.githubusercontent.com — emit info-style
-# metrics so we can see which Fastly IPs each node sees over time.
-#
-# Resolves via socket.getaddrinfo, which uses the same resolver libc
-# would use, so the IPs we record are the same ones curl/the runner
-# agent would actually connect to.
-import os
-import socket
-import tempfile
-from pathlib import Path
-
-OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom")
-HOST = "raw.githubusercontent.com"
-
-
-def resolve(host: str) -> list[str]:
-    ips: set[str] = set()
-    for family in (socket.AF_INET, socket.AF_INET6):
-        try:
-            for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM):
-                ips.add(info[4][0])
-        except socket.gaierror:
-            pass
-    return sorted(ips)
-
-
-def main() -> None:
-    ips = resolve(HOST)
-    lines = [
-        "# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST.",
-        "# TYPE runner_dns_resolved_ip gauge",
-        "# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.",
-        "# TYPE runner_dns_resolved_ip_count gauge",
-    ]
-    for ip in ips:
-        lines.append(f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1')
-    lines.append(f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}')
-
-    OUT.parent.mkdir(parents=True, exist_ok=True)
-    fd, tmp = tempfile.mkstemp(dir=str(OUT.parent), prefix=".dns_probe.")
-    try:
-        with os.fdopen(fd, "w") as f:
-            f.write("\n".join(lines) + "\n")
-        os.replace(tmp, OUT)
-    except Exception:
-        try:
-            os.unlink(tmp)
-        except FileNotFoundError:
-            pass
-        raise
-
-
-if __name__ == "__main__":
-    main()
-SCRIPT_EOF
+cat <<'PROBE_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null
+@@DNS_PROBE_PY@@
+PROBE_EOF
 sudo chmod 0755 /usr/local/bin/dns_probe.py
 sudo chown root:root /usr/local/bin/dns_probe.py
 
@@ -1269,8 +1131,13 @@ def create_cockpit_metrics_push_token(name: str) -> Token:
 def setup_runner(ssh, runner, pn):
     cockpit_metrics_ds = get_or_create_cockpit_metrics_data_source()
     cockpit_metrics_token = create_cockpit_metrics_push_token(f"{runner}-metrics-token")
+    probes_dir = os.path.join(os.path.dirname(__file__), "probes")
+    raw_github_probe_py = open(os.path.join(probes_dir, "raw_github_probe.py")).read()
+    dns_probe_py = open(os.path.join(probes_dir, "dns_probe.py")).read()
     script = SETUP_SCRIPT.replace("@@COCKPIT_METRICS_PUSH_URL@@", cockpit_metrics_ds.url) \
                          .replace("@@COCKPIT_METRICS_TOKEN@@", cockpit_metrics_token.secret_key) \
+                         .replace("@@RAW_GITHUB_PROBE_PY@@", raw_github_probe_py) \
+                         .replace("@@DNS_PROBE_PY@@", dns_probe_py) \
                          #FIXME(pn): enable private address again
                          # .replace("@@PN_IP@@", pn.ip)
                          # .replace("@@PN_VLAN_ID@@", pn.vlan_id)

From 5289a3fc02b9743d3635e82b5277821107052119 Mon Sep 17 00:00:00 2001
From: Ludovic Henry <git@ludovic.dev>
Date: Sun, 10 May 2026 22:04:36 +0000
Subject: [PATCH 10/10] Add three diagnostic panels for next TCPTimeouts
 incident
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

  H9: Spurious-RTO ratio (panel 18)
    TCPSpuriousRTOs / TCPTimeouts. ~1.0 means most RTOs are spurious
    (network was fine, kernel was impatient — rto_min tuning); ~0
    means genuine packet loss.

  H9: Fast recovery vs RTO (panel 19)
    TCPSackRecovery + TCPRenoRecovery (cheap, cwnd preserved) vs
    TCPTimeouts (expensive, cwnd collapsed). When RTO dominates the
    loss pattern is severe/bursty.

  H10: softnet drops & NAPI squeezes per CPU (panel 20)
    rate(node_softnet_dropped_total) and _times_squeezed_total. If
    times_squeezed > 0 on the CPU that handles end0 IRQs, the NAPI
    poll exhausted its budget and the NIC dropped at the ring buffer
    before TCP saw it — connects H10 (CPU0 softirq saturation) to
    H9 (downstream TCP timeouts).

https://claude.ai/code/session_01BgToCb8eDGsrkyTddCtt9Z
---
 dashboards/network-health-buckets-a-b.json | 269 ++++++++++++++++++++-
 1 file changed, 267 insertions(+), 2 deletions(-)

diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json
index 3a0b516..9117752 100644
--- a/dashboards/network-health-buckets-a-b.json
+++ b/dashboards/network-health-buckets-a-b.json
@@ -1314,6 +1314,268 @@
           "version": "13.0.1"
         }
       }
+    },
+    "panel-18": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m]) / clamp_min(rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m]), 1)",
+                      "legendFormat": "{{node}}",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "TCPSpuriousRTOs / TCPTimeouts. Close to 1.0 = most RTOs were spurious (network was fine, kernel was impatient — RTO-min tuning territory). Close to 0 = real loss.",
+        "id": 18,
+        "links": [],
+        "title": "H9: Spurious-RTO ratio (spurious / total RTO)",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "max": 1,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "percentunit"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-19": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPSackRecovery{node=~\"$node\"}[5m])",
+                      "legendFormat": "{{node}} sack-recovery",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPRenoRecovery{node=~\"$node\"}[5m])",
+                      "legendFormat": "{{node}} reno-recovery",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])",
+                      "legendFormat": "{{node}} rto",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "C"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "TCPSackRecovery + TCPRenoRecovery (fast recovery, cwnd preserved) vs TCPTimeouts (RTO, cwnd collapsed). Healthy: fast dominates. Degraded: RTO dominates → severe / bursty loss.",
+        "id": 19,
+        "links": [],
+        "title": "H9: Fast recovery vs RTO",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
+    },
+    "panel-20": {
+      "kind": "Panel",
+      "spec": {
+        "data": {
+          "kind": "QueryGroup",
+          "spec": {
+            "queries": [
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_softnet_dropped_total{node=~\"$node\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}} dropped",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "A"
+                }
+              },
+              {
+                "kind": "PanelQuery",
+                "spec": {
+                  "hidden": false,
+                  "query": {
+                    "datasource": { "name": "fflnugavx2h34c" },
+                    "group": "prometheus",
+                    "kind": "DataQuery",
+                    "spec": {
+                      "editorMode": "code",
+                      "expr": "rate(node_softnet_times_squeezed_total{node=~\"$node\"}[1m])",
+                      "legendFormat": "{{node}} cpu={{cpu}} squeezed",
+                      "range": true
+                    },
+                    "version": "v0"
+                  },
+                  "refId": "B"
+                }
+              }
+            ],
+            "queryOptions": {},
+            "transformations": []
+          }
+        },
+        "description": "node_softnet_dropped_total and _times_squeezed_total per CPU. squeezed > 0 on the CPU handling end0 IRQs = NAPI poll exhausted its budget; the NIC then drops at the ring buffer before TCP sees it. Connects H10 (softirq saturation) to H9 (downstream TCP timeouts).",
+        "id": 20,
+        "links": [],
+        "title": "H10: softnet drops & NAPI squeezes per CPU",
+        "vizConfig": {
+          "group": "timeseries",
+          "kind": "VizConfig",
+          "spec": {
+            "fieldConfig": {
+              "defaults": {
+                "color": { "mode": "palette-classic" },
+                "custom": {
+                  "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto",
+                  "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none",
+                  "hideFrom": { "legend": false, "tooltip": false, "viz": false },
+                  "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5,
+                  "scaleDistribution": { "type": "linear" },
+                  "showPoints": "auto", "showValues": false, "spanNulls": false,
+                  "stacking": { "group": "A", "mode": "none" },
+                  "thresholdsStyle": { "mode": "off" }
+                },
+                "min": 0,
+                "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] },
+                "unit": "ops"
+              },
+              "overrides": []
+            },
+            "options": {
+              "annotations": { "clustering": -1, "multiLane": false },
+              "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
+              "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" }
+            }
+          },
+          "version": "13.0.1"
+        }
+      }
     }
   },
   "layout": {
@@ -1389,7 +1651,9 @@
                 "items": [
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } },
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } },
-                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-18" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-19" } } }
                 ],
                 "maxColumnCount": 3,
                 "rowHeightMode": "standard"
@@ -1410,7 +1674,8 @@
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } },
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } },
                   { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" } } },
-                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } },
+                  { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-20" } } }
                 ],
                 "maxColumnCount": 3,
                 "rowHeightMode": "standard"