Skip to content

Commit 82fab1b

Browse files
authored
Merge pull request #305 from m-lab/sandbox-kinkade
Adds a new SwitchDownAtSite alert
2 parents c679db9 + 448b4de commit 82fab1b

File tree

1 file changed

+23
-0
lines changed

1 file changed

+23
-0
lines changed

config/federation/prometheus/alerts.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,28 @@ ALERT ScraperCollectorMissingFromScraperSync
134134
description = "",
135135
}
136136

137+
138+
# SwitchSLO
139+
#
140+
# A switch at a site has been down for too long and we need to contact the site
141+
# host or transit provider to investigate. If SNMP scraping *and* pings are both
142+
# failing for a certain period, then this is probably a reasonable stand-in for an
143+
# "up"/"aliveness" check.
144+
ALERT SwitchDownAtSite
145+
IF up{job="snmp-targets", site!~".*t$"} == 0
146+
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
147+
FOR 24h
148+
LABELS {
149+
severity = "ticket",
150+
repo = "ops-tracker"
151+
}
152+
ANNOTATIONS {
153+
summary = "The switch at a site has been unreachable for too long.",
154+
hints = "The issue could be with the switch itself, or with the transit provider.",
155+
dashboard = "https://grafana.mlab-oti.measurementlab.net/d/SuqnZ6Hiz/?orgId=1&var-site_name={{ $labels.site }}"
156+
}
157+
158+
137159
##
138160
## Inventory.
139161
##
@@ -303,6 +325,7 @@ ALERT SnmpExporterMissingMetrics
303325
# Scraping SNMP metrics from a switch is failing.
304326
ALERT SnmpScrapingDownAtSite
305327
IF up{job="snmp-targets", site!~".*t$"} == 0
328+
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 1
306329
FOR 2h
307330
LABELS {
308331
severity = "page",

0 commit comments

Comments
 (0)