@@ -51,8 +51,7 @@ ALERT CoreServices_SidestreamIsNotRunning
5151 IF sum_over_time(up{service="sidestream"}[10m]) == 0
5252 AND ON(machine)
5353 sum_over_time(probe_success{service="ssh806"}[20m]) / 20 >= 0.90
54- UNLESS ON(machine)
55- lame_duck_node == 1
54+ UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
5655 FOR 10m
5756 LABELS {
5857 severity = "page",
@@ -85,8 +84,7 @@ ALERT ScraperMostRecentArchivedFileTimeIsTooOld
8584 IF (time() - (scraper_maxrawfiletimearchived{container="scraper-sync"} != 0)) > (56 * 60 * 60)
8685 AND ON(machine)
8786 (time() - process_start_time_seconds{service="sidestream"}) > (30 * 60 * 60)
88- UNLESS ON(machine)
89- lame_duck_node == 1
87+ UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
9088 FOR 2h
9189 LABELS {
9290 severity = "page",
@@ -143,7 +141,8 @@ ALERT ScraperCollectorMissingFromScraperSync
143141# "up"/"aliveness" check.
144142ALERT SwitchDownAtSite
145143 IF up{job="snmp-targets", site!~".*t$"} == 0
146- AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
144+ AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
145+ UNLESS ON(site) gmx_site_maintenance == 1
147146 FOR 24h
148147 LABELS {
149148 severity = "ticket",
@@ -404,8 +403,8 @@ ALERT BlackboxExporterIpv6DownOrMissing
404403ALERT TooManyNdtServersDown
405404 IF count_scalar(
406405 probe_success{service="ndt_raw"} AND ON(machine)
407- up{service="nodeexporter"} == 1 UNLESS ON(machine)
408- lame_duck_node{} == 1
406+ up{service="nodeexporter"} == 1
407+ UNLESS ON(machine) ( lame_duck_node == 1 OR gmx_machine_maintenance == 1)
409408 UNLESS ON(machine) (
410409 probe_success{service="ndt_raw"} == 1 AND ON(machine)
411410 probe_success{service="ndt_ssl"} == 1 AND ON(machine)
@@ -417,8 +416,8 @@ ALERT TooManyNdtServersDown
417416 /
418417 count(
419418 probe_success{service="ndt_raw"} AND ON(machine)
420- up{service="nodeexporter"} == 1 UNLESS ON(machine)
421- lame_duck_node{} == 1
419+ up{service="nodeexporter"} == 1
420+ UNLESS ON(machine) ( lame_duck_node == 1 OR gmx_machine_maintenance == 1)
422421 ) > 0.25
423422 FOR 30m
424423 LABELS {
@@ -492,7 +491,7 @@ ALERT MobiperfMetricsMissing
492491# Some number of nodes don't have a lame-duck status.
493492ALERT LameDuckMetricMissingForNode
494493 IF up{service="nodeexporter"} == 1
495- UNLESS ON(machine) lame_duck_node{}
494+ UNLESS ON(machine) ( lame_duck_node OR gmx_machine_maintenance == 1)
496495 FOR 30m
497496 LABELS {
498497 severity = "ticket",
@@ -523,6 +522,7 @@ ALERT VdlimitMetricsMissingForNode
523522# A collectd-mlab service has a problem and is down.
524523ALERT CoreServices_CollectdMlabDown
525524 IF collectd_mlab_success{} == 0
525+ UNLESS ON(machine) gmx_machine_maintenance == 1
526526 FOR 10m
527527 LABELS {
528528 severity = "ticket",
0 commit comments