@@ -51,8 +51,7 @@ ALERT CoreServices_SidestreamIsNotRunning
51
51
IF sum_over_time(up{service="sidestream"}[10m]) == 0
52
52
AND ON(machine)
53
53
sum_over_time(probe_success{service="ssh806"}[20m]) / 20 >= 0.90
54
- UNLESS ON(machine)
55
- lame_duck_node == 1
54
+ UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
56
55
FOR 10m
57
56
LABELS {
58
57
severity = "page",
@@ -85,8 +84,7 @@ ALERT ScraperMostRecentArchivedFileTimeIsTooOld
85
84
IF (time() - (scraper_maxrawfiletimearchived{container="scraper-sync"} != 0)) > (56 * 60 * 60)
86
85
AND ON(machine)
87
86
(time() - process_start_time_seconds{service="sidestream"}) > (30 * 60 * 60)
88
- UNLESS ON(machine)
89
- lame_duck_node == 1
87
+ UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
90
88
FOR 2h
91
89
LABELS {
92
90
severity = "page",
@@ -143,7 +141,8 @@ ALERT ScraperCollectorMissingFromScraperSync
143
141
# "up"/"aliveness" check.
144
142
ALERT SwitchDownAtSite
145
143
IF up{job="snmp-targets", site!~".*t$"} == 0
146
- AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
144
+ AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
145
+ UNLESS ON(site) gmx_site_maintenance == 1
147
146
FOR 24h
148
147
LABELS {
149
148
severity = "ticket",
@@ -404,8 +403,8 @@ ALERT BlackboxExporterIpv6DownOrMissing
404
403
ALERT TooManyNdtServersDown
405
404
IF count_scalar(
406
405
probe_success{service="ndt_raw"} AND ON(machine)
407
- up{service="nodeexporter"} == 1 UNLESS ON(machine)
408
- lame_duck_node{} == 1
406
+ up{service="nodeexporter"} == 1
407
+ UNLESS ON(machine) ( lame_duck_node == 1 OR gmx_machine_maintenance == 1)
409
408
UNLESS ON(machine) (
410
409
probe_success{service="ndt_raw"} == 1 AND ON(machine)
411
410
probe_success{service="ndt_ssl"} == 1 AND ON(machine)
@@ -417,8 +416,8 @@ ALERT TooManyNdtServersDown
417
416
/
418
417
count(
419
418
probe_success{service="ndt_raw"} AND ON(machine)
420
- up{service="nodeexporter"} == 1 UNLESS ON(machine)
421
- lame_duck_node{} == 1
419
+ up{service="nodeexporter"} == 1
420
+ UNLESS ON(machine) ( lame_duck_node == 1 OR gmx_machine_maintenance == 1)
422
421
) > 0.25
423
422
FOR 30m
424
423
LABELS {
@@ -492,7 +491,7 @@ ALERT MobiperfMetricsMissing
492
491
# Some number of nodes don't have a lame-duck status.
493
492
ALERT LameDuckMetricMissingForNode
494
493
IF up{service="nodeexporter"} == 1
495
- UNLESS ON(machine) lame_duck_node{}
494
+ # Match lame_duck_node at ANY value: this alert is about the metric being absent, not about its state.
+ UNLESS ON(machine) (lame_duck_node OR gmx_machine_maintenance == 1)
496
495
FOR 30m
497
496
LABELS {
498
497
severity = "ticket",
@@ -523,6 +522,7 @@ ALERT VdlimitMetricsMissingForNode
523
522
# A collectd-mlab service has a problem and is down.
524
523
ALERT CoreServices_CollectdMlabDown
525
524
IF collectd_mlab_success{} == 0
525
+ UNLESS ON(machine) gmx_machine_maintenance == 1
526
526
FOR 10m
527
527
LABELS {
528
528
severity = "ticket",
0 commit comments