Skip to content

Commit db47ce2

Browse files
authored
Adds a gmx_machine_state{}==1 condition to a number of alerts. (#307)
* Adds a gmx_machine_state{}==1 condition to any alerts that already have a lame_duck_node condition.
* Adds gmx_[site, machine]_state conditions to a couple more alerts.
* A few enhancements and fixes per suggestions/comments from PR #307.
1 parent 8f3583b commit db47ce2

File tree

1 file changed

+10
-10
lines changed

1 file changed

+10
-10
lines changed

config/federation/prometheus/alerts.yml

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -51,8 +51,7 @@ ALERT CoreServices_SidestreamIsNotRunning
5151
IF sum_over_time(up{service="sidestream"}[10m]) == 0
5252
AND ON(machine)
5353
sum_over_time(probe_success{service="ssh806"}[20m]) / 20 >= 0.90
54-
UNLESS ON(machine)
55-
lame_duck_node == 1
54+
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
5655
FOR 10m
5756
LABELS {
5857
severity = "page",
@@ -85,8 +84,7 @@ ALERT ScraperMostRecentArchivedFileTimeIsTooOld
8584
IF (time() - (scraper_maxrawfiletimearchived{container="scraper-sync"} != 0)) > (56 * 60 * 60)
8685
AND ON(machine)
8786
(time() - process_start_time_seconds{service="sidestream"}) > (30 * 60 * 60)
88-
UNLESS ON(machine)
89-
lame_duck_node == 1
87+
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
9088
FOR 2h
9189
LABELS {
9290
severity = "page",
@@ -143,7 +141,8 @@ ALERT ScraperCollectorMissingFromScraperSync
143141
# "up"/"aliveness" check.
144142
ALERT SwitchDownAtSite
145143
IF up{job="snmp-targets", site!~".*t$"} == 0
146-
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
144+
AND ON(site) probe_success{instance=~"s1.*", module="icmp"} == 0
145+
UNLESS ON(site) gmx_site_maintenance == 1
147146
FOR 24h
148147
LABELS {
149148
severity = "ticket",
@@ -404,8 +403,8 @@ ALERT BlackboxExporterIpv6DownOrMissing
404403
ALERT TooManyNdtServersDown
405404
IF count_scalar(
406405
probe_success{service="ndt_raw"} AND ON(machine)
407-
up{service="nodeexporter"} == 1 UNLESS ON(machine)
408-
lame_duck_node{} == 1
406+
up{service="nodeexporter"} == 1
407+
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
409408
UNLESS ON(machine) (
410409
probe_success{service="ndt_raw"} == 1 AND ON(machine)
411410
probe_success{service="ndt_ssl"} == 1 AND ON(machine)
@@ -417,8 +416,8 @@ ALERT TooManyNdtServersDown
417416
/
418417
count(
419418
probe_success{service="ndt_raw"} AND ON(machine)
420-
up{service="nodeexporter"} == 1 UNLESS ON(machine)
421-
lame_duck_node{} == 1
419+
up{service="nodeexporter"} == 1
420+
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
422421
) > 0.25
423422
FOR 30m
424423
LABELS {
@@ -492,7 +491,7 @@ ALERT MobiperfMetricsMissing
492491
# Some number of nodes don't have a lame-duck status.
493492
ALERT LameDuckMetricMissingForNode
494493
IF up{service="nodeexporter"} == 1
495-
UNLESS ON(machine) lame_duck_node{}
494+
UNLESS ON(machine) (lame_duck_node == 1 OR gmx_machine_maintenance == 1)
496495
FOR 30m
497496
LABELS {
498497
severity = "ticket",
@@ -523,6 +522,7 @@ ALERT VdlimitMetricsMissingForNode
523522
# A collectd-mlab service has a problem and is down.
524523
ALERT CoreServices_CollectdMlabDown
525524
IF collectd_mlab_success{} == 0
525+
UNLESS ON(machine) gmx_machine_maintenance == 1
526526
FOR 10m
527527
LABELS {
528528
severity = "ticket",

0 commit comments

Comments
 (0)