File tree Expand file tree Collapse file tree 1 file changed +23
-0
lines changed
config/federation/prometheus Expand file tree Collapse file tree 1 file changed +23
-0
lines changed Original file line number Diff line number Diff line change @@ -134,6 +134,28 @@ ALERT ScraperCollectorMissingFromScraperSync
134
134
description = "",
135
135
}
136
136
137
+
138
+ # SwitchSLO
139
+ #
140
+ # A switch at a site has been down for too long and we need to contact the site
141
+ # host or transit provider to investigate. If SNMP scraping *and* pings are both
142
+ # failing for a certain period, then this is probably a reasonable stand-in for an
143
+ # "up"/"aliveness" check.
144
+ ALERT SwitchDownAtSite
145
+ IF up{job="snmp-targets", site!~".*t$"} == 0
146
+ AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
147
+ FOR 24h
148
+ LABELS {
149
+ severity = "ticket",
150
+ repo = "ops-tracker"
151
+ }
152
+ ANNOTATIONS {
153
+ summary = "The switch at a site has been unreachable for too long.",
154
+ hints = "The issue could be with the switch itself, or with the transit provider.",
155
+ dashboard = "https://grafana.mlab-oti.measurementlab.net/d/SuqnZ6Hiz/?orgId=1&var-site_name={{ $labels.site }}"
156
+ }
157
+
158
+
137
159
# #
138
160
# # Inventory.
139
161
# #
@@ -303,6 +325,7 @@ ALERT SnmpExporterMissingMetrics
303
325
# Scraping SNMP metrics from a switch is failing.
304
326
ALERT SnmpScrapingDownAtSite
305
327
IF up{job="snmp-targets", site!~".*t$"} == 0
328
+ AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 1
306
329
FOR 2h
307
330
LABELS {
308
331
severity = "page",
You can’t perform that action at this time.
0 commit comments