diff --git a/dashboards/network-health-buckets-a-b.json b/dashboards/network-health-buckets-a-b.json new file mode 100644 index 0000000..9117752 --- /dev/null +++ b/dashboards/network-health-buckets-a-b.json @@ -0,0 +1,1762 @@ +{ + "annotations": [ + { + "kind": "AnnotationQuery", + "spec": { + "builtIn": true, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "query": { + "datasource": { "name": "-- Grafana --" }, + "group": "grafana", + "kind": "DataQuery", + "spec": {}, + "version": "v0" + } + } + } + ], + "cursorSync": "Off", + "editable": true, + "elements": { + "panel-1": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_cpu_seconds_total{mode=\"softirq\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_cpu_seconds_total{mode=\"softirq\"}). H10 primary signal — CPU0 approaching 1.0 while others stay near 0 = saturation. 
Threshold line at 0.9.", + "id": 1, + "links": [], + "title": "H10: CPU softirq% by CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.9 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-2": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softirqs_functions_total{type=\"NET_RX\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_softirqs_functions_total{type=\"NET_RX\"}). 
Cross-check for the CPU softirq% panel. Needs --collector.softirqs.", + "id": 2, + "links": [], + "title": "H10: NET_RX softirq deliveries/sec per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-3": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_interrupts_total{devices=~\".*end0.*\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} dev={{devices}} irq={{type}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": 
"rate(node_interrupts_total{devices=~\".*end0.*\"}). Single-queue NIC pins all end0 IRQs to one CPU; this should structurally show one line > 0 and the rest at 0. Needs --collector.interrupts.", + "id": 3, + "links": [], + "title": "H10: end0 hard IRQs/sec by CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-4": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + 
"description": "rate(node_netstat_TcpExt_TCPTimeouts). Each timeout = up to 120 s of RTO backoff. H9 primary.", + "id": 4, + "links": [], + "title": "H9: TCPTimeouts/sec", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-5": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_Tcp_RetransSegs{node=~\"$node\"}[5m]) / clamp_min(rate(node_netstat_Tcp_OutSegs{node=~\"$node\"}[5m]), 1)", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": 
"rate(RetransSegs) / rate(OutSegs) over 5m; the denominator is clamped to at least 1 segment/s, so the ratio under-reads on a near-idle link. Healthy < 0.5 %; threshold line at 0.005.", + "id": 5, + "links": [], + "title": "H9: Retransmit ratio (RetransSegs / OutSegs)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.005 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-6": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPLostRetransmit{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} lost-retransmit", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": 
"fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} spurious-rto", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPLostRetransmit and TCPSpuriousRTOs. Distinguishes real loss from spurious RTO detection.", + "id": 6, + "links": [], + "title": "H9: Lost retransmits & spurious RTOs", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-7": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": 
"DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6", + "legendFormat": "{{node}} rx", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_bytes_total{device=\"end0\",node=~\"$node\"}[1m]) * 8 / 1e6", + "legendFormat": "{{node}} tx", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_receive/transmit_bytes_total{device=\"end0\"}) * 8 / 1e6. Threshold at 90 Mbps (10 % shy of the 100 Mbps line).", + "id": 7, + "links": [], + "title": "Sentinel: end0 throughput (Mbps) vs 100 Mbps line", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "orange", "value": 90 } + ] + }, + "unit": "Mbits" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": 
["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-8": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_drop_total{device=\"end0\",node=~\"$node\"}[5m])", + "legendFormat": "{{node}} rx-drop", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_receive_errs_total{device=\"end0\",node=~\"$node\"}[5m])", + "legendFormat": "{{node}} rx-errs", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_receive_{drop,errs}_total{device=\"end0\"}). 
Should stay flat at 0; any climb on the receive path reopens H2.", + "id": 8, + "links": [], + "title": "Sentinel: end0 RX drops & errors (H2)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-9": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "node_nf_conntrack_entries{node=~\"$node\"} / node_nf_conntrack_entries_limit{node=~\"$node\"}", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "node_nf_conntrack_entries / _limit. 
Should stay << 0.01 on a healthy runner; H3 fully refuted on riscv-runner-20.", + "id": 9, + "links": [], + "title": "Sentinel: Conntrack utilisation (H3)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "line" } + }, + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { "color": "green", "value": 0 }, + { "color": "red", "value": 0.5 } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-10": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n{device=\"end0\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + 
"description": "rate(node_ethtool_irq_transmitted_path_in_lpi_mode_n). LPI being entered constantly mostly means the link is idle; it is only interesting if a slow-run window correlates with a marked change. Needs --collector.ethtool.", + "id": 10, + "links": [], + "title": "H7: EEE / LPI enter rate (fallback)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-11": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_drop_total{device=\"end0\",node=~\"$node\"}[5m])", + "legendFormat": "{{node}} tx-drop", + "range": true + }, + "version": "v0" + }, + "refId": "A" 
+ } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_network_transmit_errs_total{device=\"end0\",node=~\"$node\"}[5m])", + "legendFormat": "{{node}} tx-errs", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_network_transmit_{drop,errs}_total{device=\"end0\"}). Should stay flat at 0; any climb on the transmit path means egress trouble.", + "id": 11, + "links": [], + "title": "Sentinel: end0 TX drops & errors", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-12": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + 
"queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{remote_ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_seconds for the Fastly target, faceted by remote_ip. A specific Fastly IP cluster being slow while others are fast = H1 (POP steering).", + "id": 12, + "links": [], + "title": "H1: raw.githubusercontent.com — download time by Fastly IP", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "s" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-13": { + "kind": "Panel", + "spec": { + 
"data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{remote_ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_bytes_per_second for the Fastly target, faceted by remote_ip.", + "id": 13, + "links": [], + "title": "H1: raw.githubusercontent.com — throughput by Fastly IP", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-14": { + "kind": "Panel", + "spec": { + "data": { + 
"kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "runner_dns_resolved_ip{host=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} ip={{ip}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "runner_dns_resolved_ip — info metric (1 per resolved IP). Shows which Fastly IPs each node sees over time. Cache-region flips show up as the set of IP lines changing.", + "id": 14, + "links": [], + "title": "H1: DNS resolutions for raw.githubusercontent.com", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "max": 1, + "thresholds": { "mode": "absolute", "steps": [ { "color": "blue", "value": 0 } ] }, + "unit": "short" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + 
"panel-15": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} fastly", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_seconds{target=\"cloudflare-control\",node=~\"$node\"}", + "legendFormat": "{{node}} cloudflare", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_seconds for both targets. A sag is a slowdown, so on this panel it shows as download time climbing. When Fastly sags but Cloudflare stays flat → H9 (path-specific). 
When both sag together → H5 (Scaleway WAN egress).", + "id": 15, + "links": [], + "title": "H5: Probe download time — Fastly vs Cloudflare", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "s" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-16": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"raw.githubusercontent.com\",node=~\"$node\"}", + "legendFormat": "{{node}} fastly", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", 
+ "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "raw_github_probe_bytes_per_second{target=\"cloudflare-control\",node=~\"$node\"}", + "legendFormat": "{{node}} cloudflare", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "raw_github_probe_bytes_per_second for both targets. Same correlation logic as the time panel.", + "id": 16, + "links": [], + "title": "H5: Probe throughput — Fastly vs Cloudflare", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "Bps" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-17": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + 
"spec": { + "editorMode": "code", + "expr": "rate(node_softirqs_functions_total{type=\"NET_TX\",node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "rate(node_softirqs_functions_total{type=\"NET_TX\"}). TX-side softirq pressure per CPU; pair with the NET_RX panel for the full softirq picture on the link.", + "id": 17, + "links": [], + "title": "H10: NET_TX softirq deliveries/sec per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-18": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": 
"DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSpuriousRTOs{node=~\"$node\"}[5m]) / (rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m]) > 0)", + "legendFormat": "{{node}}", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPSpuriousRTOs / TCPTimeouts. Close to 1.0 = most RTOs were spurious (network was fine, kernel was impatient — RTO-min tuning territory). Close to 0 = real loss. Gaps = no RTOs in the window (ratio undefined).", + "id": 18, + "links": [], + "title": "H9: Spurious-RTO ratio (spurious / total RTO)", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "max": 1, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "percentunit" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "list", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-19": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { +
"hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPSackRecovery{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} sack-recovery", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPRenoRecovery{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} reno-recovery", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_netstat_TcpExt_TCPTimeouts{node=~\"$node\"}[5m])", + "legendFormat": "{{node}} rto", + "range": true + }, + "version": "v0" + }, + "refId": "C" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "TCPSackRecovery + TCPRenoRecovery (fast recovery, cwnd preserved) vs TCPTimeouts (RTO, cwnd collapsed). Healthy: fast dominates. 
Degraded: RTO dominates → severe / bursty loss.", + "id": 19, + "links": [], + "title": "H9: Fast recovery vs RTO", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + }, + "panel-20": { + "kind": "Panel", + "spec": { + "data": { + "kind": "QueryGroup", + "spec": { + "queries": [ + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "editorMode": "code", + "expr": "rate(node_softnet_dropped_total{node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} dropped", + "range": true + }, + "version": "v0" + }, + "refId": "A" + } + }, + { + "kind": "PanelQuery", + "spec": { + "hidden": false, + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + 
"editorMode": "code", + "expr": "rate(node_softnet_times_squeezed_total{node=~\"$node\"}[1m])", + "legendFormat": "{{node}} cpu={{cpu}} squeezed", + "range": true + }, + "version": "v0" + }, + "refId": "B" + } + } + ], + "queryOptions": {}, + "transformations": [] + } + }, + "description": "node_softnet_dropped_total and _times_squeezed_total per CPU. squeezed > 0 on the CPU handling end0 IRQs = NAPI poll exhausted its budget; the NIC then drops at the ring buffer before TCP sees it. Connects H10 (softirq saturation) to H9 (downstream TCP timeouts).", + "id": 20, + "links": [], + "title": "H10: softnet drops & NAPI squeezes per CPU", + "vizConfig": { + "group": "timeseries", + "kind": "VizConfig", + "spec": { + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", + "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", + "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, + "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", "showValues": false, "spanNulls": false, + "stacking": { "group": "A", "mode": "none" }, + "thresholdsStyle": { "mode": "off" } + }, + "min": 0, + "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": 0 } ] }, + "unit": "ops" + }, + "overrides": [] + }, + "options": { + "annotations": { "clustering": -1, "multiLane": false }, + "legend": { "calcs": ["lastNotNull", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "hideZeros": false, "mode": "multi", "sort": "none" } + } + }, + "version": "13.0.1" + } + } + } + }, + "layout": { + "kind": "RowsLayout", + "spec": { + "rows": [ + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H1 — Bad Fastly 
POP / IPv6 path", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-12" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-13" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-14" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H5 — Scaleway WAN egress contention", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-15" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-16" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H7 — EEE / LPI micro-stalls (fallback)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-10" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H9 — TCP-stack health (Scaleway↔Fastly path)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-4" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-5" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-6" } } }, + { 
"kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-18" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-19" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "H10 — Single-core CPU0 softirq saturation", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-1" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-2" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-17" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-3" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-20" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + }, + { + "kind": "RowsLayoutRow", + "spec": { + "collapse": false, + "title": "Sentinels (H2 / H3 refuted, monitoring)", + "layout": { + "kind": "AutoGridLayout", + "spec": { + "columnWidthMode": "standard", + "items": [ + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-7" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-8" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-11" } } }, + { "kind": "AutoGridLayoutItem", "spec": { "element": { "kind": "ElementReference", "name": "panel-9" } } } + ], + "maxColumnCount": 3, + "rowHeightMode": "standard" + } + } + } + } + ] + } + }, + "links": [], + "liveNow": false, + "preferences": { + "layout": { + "kind": "AutoGridLayout", + "spec": { 
"columnWidthMode": "standard", "items": [], "maxColumnCount": 3, "rowHeightMode": "standard" } + } + }, + "preload": false, + "tags": ["rise", "riscv-runner", "network"], + "timeSettings": { + "autoRefresh": "30s", + "autoRefreshIntervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], + "fiscalYearStartMonth": 0, + "from": "now-1h", + "hideTimepicker": false, + "timezone": "browser", + "to": "now" + }, + "title": "RISE RISC-V runner — network health", + "variables": [ + { + "kind": "QueryVariable", + "spec": { + "allowCustomValue": false, + "current": { "text": "All", "value": ["$__all"] }, + "definition": "label_values(node)", + "hide": "dontHide", + "includeAll": true, + "label": "Node", + "multi": true, + "name": "node", + "options": [], + "query": { + "datasource": { "name": "fflnugavx2h34c" }, + "group": "prometheus", + "kind": "DataQuery", + "spec": { + "qryType": 1, + "query": "label_values(node)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "version": "v0" + }, + "refresh": "onDashboardLoad", + "regex": "", + "regexApplyTo": "value", + "skipUrlSync": false, + "sort": "alphabeticalAsc" + } + } + ] +} diff --git a/scripts/probes/dns_probe.py b/scripts/probes/dns_probe.py new file mode 100644 index 0000000..62d0a5d --- /dev/null +++ b/scripts/probes/dns_probe.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 +"""Snapshot which IPs raw.githubusercontent.com currently resolves to, for +the node_exporter textfile collector. + +Resolves via socket.getaddrinfo (the libc resolver curl uses), so the IPs +we record match the customer's real traffic path. Used to correlate slow +windows with Fastly cache-region or POP flips (H1). +""" + +import os +import socket +import tempfile +from pathlib import Path + +OUT = Path("/var/lib/node_exporter/textfile_collector/dns_probe.prom") +HOST = "raw.githubusercontent.com" + +HEADER = """\ +# HELP runner_dns_resolved_ip Info metric: 1 per IP currently resolved for HOST. 
+# TYPE runner_dns_resolved_ip gauge
+# HELP runner_dns_resolved_ip_count Number of IPs returned for HOST.
+# TYPE runner_dns_resolved_ip_count gauge
+"""
+
+
+def resolve(host: str) -> list[str]:  # sorted, de-duplicated IPv4 + IPv6 addresses; [] if neither family resolves
+    ips: set[str] = set()
+    for family in (socket.AF_INET, socket.AF_INET6):
+        try:
+            ips.update(info[4][0] for info in socket.getaddrinfo(host, None, family, socket.SOCK_STREAM))
+        except socket.gaierror:
+            pass  # best effort: a family that fails to resolve contributes nothing
+    return sorted(ips)
+
+
+def write_atomic(path: Path, content: str) -> None:
+    """Write content via tempfile + os.replace so node_exporter never sees a half-written file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." + path.name + ".")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        os.replace(tmp, path)
+    except BaseException:  # also KeyboardInterrupt/SystemExit, so the temp file is removed on any exit
+        Path(tmp).unlink(missing_ok=True)
+        raise
+
+
+def main() -> None:
+    ips = resolve(HOST)
+    body = HEADER + "".join(
+        f'runner_dns_resolved_ip{{host="{HOST}",ip="{ip}"}} 1\n' for ip in ips
+    ) + f'runner_dns_resolved_ip_count{{host="{HOST}"}} {len(ips)}\n'
+    write_atomic(OUT, body)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/probes/raw_github_probe.py b/scripts/probes/raw_github_probe.py
new file mode 100644
index 0000000..246d2d3
--- /dev/null
+++ b/scripts/probes/raw_github_probe.py
@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Probe raw.githubusercontent.com download speed for the node_exporter
+textfile collector.
+
+Hits the Fastly target the customer's CI actually downloads from and a
+non-Fastly comparison endpoint. When the Fastly target sags but the
+control stays flat → H9 (Scaleway↔Fastly path). When both sag → H5
+(Scaleway WAN egress).
+
+curl is shelled out so we get %{remote_ip} reflecting the IP libcurl
+actually connected to — matches the customer's real traffic path more
+faithfully than socket.gethostbyname would.
+"""
+
+import os
+import subprocess
+import tempfile
+from pathlib import Path
+
+OUT = Path("/var/lib/node_exporter/textfile_collector/raw_github_probe.prom")
+
+TARGETS = [
+    ("raw.githubusercontent.com",
+     "https://raw.githubusercontent.com/usnistgov/ACVP-Server/master/README.md"),
+    ("cloudflare-control",
+     "https://speed.cloudflare.com/__down?bytes=1048576"),
+]
+
+CURL_FORMAT = "%{time_total} %{speed_download} %{remote_ip} %{exitcode}\n"
+
+HEADER = """\
+# HELP raw_github_probe_seconds Wallclock to download a fixed test artefact.
+# TYPE raw_github_probe_seconds gauge
+# HELP raw_github_probe_bytes_per_second Average download throughput in bytes/sec.
+# TYPE raw_github_probe_bytes_per_second gauge
+# HELP raw_github_probe_curl_exit_code Curl exit code; 0 on success.
+# TYPE raw_github_probe_curl_exit_code gauge
+"""
+
+
+def probe(target: str, url: str) -> str:
+    """Run one curl; return the three Prom-formatted sample lines (exit code 99 = curl produced no result)."""
+    try:
+        result = subprocess.run(
+            ["curl", "-o", "/dev/null", "-s", "--max-time", "30", "-w", CURL_FORMAT, url],
+            capture_output=True, text=True, timeout=35,  # 35 s leaves room for curl's own 30 s cap
+        )
+        fields = (result.stdout.strip() or "0 0 unknown 99").split(" ")  # split(" ") keeps an empty %{remote_ip} in its slot
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        fields = ["0", "0", "unknown", "99"]
+    seconds, bps, ip, code = [f or d for f, d in zip(fields + [""] * 4, ["0", "0", "unknown", "99"])]  # per-slot defaults
+    return (
+        f'raw_github_probe_seconds{{target="{target}",remote_ip="{ip}"}} {seconds}\n'
+        f'raw_github_probe_bytes_per_second{{target="{target}"}} {bps}\n'
+        f'raw_github_probe_curl_exit_code{{target="{target}"}} {code}\n'
+    )
+
+
+def write_atomic(path: Path, content: str) -> None:
+    """Write content via tempfile + os.replace so node_exporter never sees a half-written file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    fd, tmp = tempfile.mkstemp(dir=path.parent, prefix="." + path.name + ".")
+    try:
+        with os.fdopen(fd, "w") as f:
+            f.write(content)
+        os.replace(tmp, path)
+    except BaseException:  # also KeyboardInterrupt/SystemExit, so the temp file is removed on any exit
+        Path(tmp).unlink(missing_ok=True)
+        raise
+
+
+def main() -> None:
+    write_atomic(OUT, HEADER + "".join(probe(name, url) for name, url in TARGETS))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/scw.py b/scripts/scw.py
index 34f3567..62a1f45 100644
--- a/scripts/scw.py
+++ b/scripts/scw.py
@@ -864,6 +864,84 @@ def is_ready(res):
 sudo systemctl daemon-reload
 sudo systemctl enable prometheus-agent
 
+###############################################################################
+## Install probe scripts (textfile collector)
+
+# Sources live in scripts/probes/; substituted in by setup_runner().
+
+cat <<'PROBE_EOF' | sudo tee /usr/local/bin/raw_github_probe.py >/dev/null
+@@RAW_GITHUB_PROBE_PY@@
+PROBE_EOF
+sudo chmod 0755 /usr/local/bin/raw_github_probe.py
+sudo chown root:root /usr/local/bin/raw_github_probe.py
+
+cat <<'PROBE_EOF' | sudo tee /usr/local/bin/dns_probe.py >/dev/null
+@@DNS_PROBE_PY@@
+PROBE_EOF
+sudo chmod 0755 /usr/local/bin/dns_probe.py
+sudo chown root:root /usr/local/bin/dns_probe.py
+
+# Systemd timers running each probe as the node_exporter user.
+cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.service +[Unit] +Description=Synthetic probe of raw.githubusercontent.com and a comparison target +After=network-online.target +Wants=network-online.target +[Service] +Type=oneshot +User=node_exporter +Group=node_exporter +ExecStart=/usr/local/bin/raw_github_probe.py +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +PrivateTmp=true +ReadWritePaths=/var/lib/node_exporter +EOF + +cat <<'EOF' | sudo tee /etc/systemd/system/raw-github-probe.timer +[Unit] +Description=Run raw-github-probe every 5 minutes +[Timer] +OnBootSec=1min +OnUnitActiveSec=5min +Unit=raw-github-probe.service +[Install] +WantedBy=timers.target +EOF + +cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.service +[Unit] +Description=DNS resolution snapshot for raw.githubusercontent.com +After=network-online.target +Wants=network-online.target +[Service] +Type=oneshot +User=node_exporter +Group=node_exporter +ExecStart=/usr/local/bin/dns_probe.py +NoNewPrivileges=true +ProtectSystem=strict +ProtectHome=true +PrivateTmp=true +ReadWritePaths=/var/lib/node_exporter +EOF + +cat <<'EOF' | sudo tee /etc/systemd/system/dns-probe.timer +[Unit] +Description=Run dns-probe every 60 seconds +[Timer] +OnBootSec=30s +OnUnitActiveSec=60s +Unit=dns-probe.service +[Install] +WantedBy=timers.target +EOF + +sudo systemctl daemon-reload +sudo systemctl enable raw-github-probe.timer +sudo systemctl enable dns-probe.timer + ############################################################################### ## Install containerd @@ -1053,8 +1131,13 @@ def create_cockpit_metrics_push_token(name: str) -> Token: def setup_runner(ssh, runner, pn): cockpit_metrics_ds = get_or_create_cockpit_metrics_data_source() cockpit_metrics_token = create_cockpit_metrics_push_token(f"{runner}-metrics-token") + probes_dir = os.path.join(os.path.dirname(__file__), "probes") + raw_github_probe_py = open(os.path.join(probes_dir, "raw_github_probe.py")).read() + dns_probe_py = 
open(os.path.join(probes_dir, "dns_probe.py")).read() script = SETUP_SCRIPT.replace("@@COCKPIT_METRICS_PUSH_URL@@", cockpit_metrics_ds.url) \ .replace("@@COCKPIT_METRICS_TOKEN@@", cockpit_metrics_token.secret_key) \ + .replace("@@RAW_GITHUB_PROBE_PY@@", raw_github_probe_py) \ + .replace("@@DNS_PROBE_PY@@", dns_probe_py) \ #FIXME(pn): enable private address again # .replace("@@PN_IP@@", pn.ip) # .replace("@@PN_VLAN_ID@@", pn.vlan_id)