Skip to content

Commit d730828

Browse files
authored
Merge pull request #8 from rhythmictech/ENGB360-22
Update monitors
2 parents 4aa8a64 + 462f2e1 commit d730828

File tree

18 files changed

+156
-58
lines changed

18 files changed

+156
-58
lines changed

aws/elasticsearch/variables.tf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -89,13 +89,13 @@ variable "cpu_utilization_evaluation_window" {
8989
}
9090

9191
variable "cpu_utilization_threshold_critical" {
92-
default = 0.90
92+
default = 90
9393
description = "Critical threshold (percent)"
9494
type = number
9595
}
9696

9797
variable "cpu_utilization_threshold_warning" {
98-
default = 0.80
98+
default = 80
9999
description = "Warning threshold (percent)"
100100
type = number
101101
}

aws/rds/variables.tf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ variable "connection_count_anomaly_enabled" {
2323
}
2424

2525
variable "connection_count_anomaly_evaluation_window" {
26-
default = "last_1h"
26+
default = "last_4h"
2727
description = "Evaluation window for monitor (`last_?m` (1, 5, 10, 15, or 30), `last_?h` (1, 2, or 4), or `last_1d`)"
2828
type = string
2929
}

host/agent/main.tf

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,32 +4,34 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "host_unreachable" {
1212
count = var.host_unreachable_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "Host Unreachable - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
14+
name = join("", [local.title_prefix, "Datadog Agent Status - {{name.name}}", local.title_suffix])
15+
include_tags = false
16+
message = var.host_unreachable_use_message ? local.query_alert_base_message : ""
1617
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1718
type = "service check"
1819

1920
evaluation_delay = var.evaluation_delay
2021
new_group_delay = var.new_group_delay
21-
notify_no_data = var.notify_no_data
22+
no_data_timeframe = "5"
23+
notify_no_data = true
2224
renotify_interval = var.renotify_interval
2325
require_full_window = true
2426
timeout_h = var.timeout_h
2527

2628
query = <<EOQ
27-
"datadog.agent.up"${local.service_filter}.by("host").last(6).count_by_status()
29+
"datadog.agent.up"${local.service_filter}.by("name","aws_account","env","datadog_managed").last(2).count_by_status()
2830
EOQ
2931

3032
monitor_thresholds {
3133
ok = 1
3234
warning = 1
33-
critical = 5
35+
critical = 1
3436
}
3537
}

host/agent/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,9 @@ variable "host_unreachable_enabled" {
2121
description = "Flag to enable Host unreachable monitor"
2222
type = bool
2323
}
24+
25+
variable "host_unreachable_use_message" {
26+
default = true
27+
description = "Flag to enable Host unreachable alerting"
28+
type = bool
29+
}

host/clock/main.tf

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,27 +4,28 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "system_clock" {
1212
count = var.system_clock_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "System Clock - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
14+
name = join("", [local.title_prefix, "System Clock - {{name.name}}", local.title_suffix])
15+
include_tags = false
16+
message = var.system_clock_use_message ? local.query_alert_base_message : ""
1617
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1718
type = "service check"
1819

1920
evaluation_delay = var.evaluation_delay
2021
new_group_delay = var.new_group_delay
21-
notify_no_data = var.notify_no_data
22+
notify_no_data = false
2223
renotify_interval = var.renotify_interval
2324
require_full_window = true
2425
timeout_h = var.timeout_h
2526

2627
query = <<EOQ
27-
"ntp.in_sync"${local.service_filter}.by("host").last(6).count_by_status()
28+
"ntp.in_sync"${local.service_filter}.by("name","aws_account","env","datadog_managed").last(6).count_by_status()
2829
EOQ
2930

3031
monitor_thresholds {

host/clock/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,9 @@ variable "system_clock_enabled" {
2121
description = "Flag to enable Host unreachable monitor"
2222
type = bool
2323
}
24+
25+
variable "system_clock_use_message" {
26+
default = false
27+
description = "Flag to enable System Clock alerting"
28+
type = bool
29+
}

host/cpu/main.tf

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,29 +4,31 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "cpu_utilization" {
1212
count = var.cpu_utilization_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "CPU Utilization - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
14+
name = join("", [local.title_prefix, "CPU Utilization - {{name.name}}", local.title_suffix])
15+
message = var.cpu_utilization_use_message ? local.query_alert_base_message : ""
1616
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1717
type = "query alert"
1818

1919
evaluation_delay = var.evaluation_delay
2020
new_group_delay = var.new_group_delay
21-
notify_no_data = var.notify_no_data
21+
notify_no_data = false
2222
no_data_timeframe = var.cpu_utilization_no_data_window
2323
renotify_interval = var.renotify_interval
2424
require_full_window = true
2525
timeout_h = var.timeout_h
26+
include_tags = false
27+
2628

2729
query = <<EOQ
2830
${var.cpu_utilization_time_aggregator}(${var.cpu_utilization_timeframe}): (
29-
100 - avg:system.cpu.idle${local.query_filter} by {host}
31+
100 - avg:system.cpu.idle${local.query_filter} by {name,aws_account,env,datadog_managed}
3032
) > ${var.cpu_utilization_threshold_critical}
3133
EOQ
3234

host/cpu/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,9 @@ variable "cpu_utilization_threshold_warning" {
5151
description = "Warning threshold (percent)"
5252
type = number
5353
}
54+
55+
variable "cpu_utilization_use_message" {
56+
default = false
57+
description = "Flag to enable CPU Utilization alerting"
58+
type = bool
59+
}

host/disk/main.tf

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "disk_space" {
1212
count = var.disk_space_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "Disk Space - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
14+
name = join("", [local.title_prefix, "Disk Space - {{name.name}}", local.title_suffix])
15+
message = var.disk_space_use_message ? local.query_alert_base_message : ""
1616
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1717
type = "query alert"
1818

@@ -26,7 +26,7 @@ resource "datadog_monitor" "disk_space" {
2626

2727
query = <<EOQ
2828
${var.disk_space_time_aggregator}(${var.disk_space_timeframe}):
29-
avg:system.disk.in_use${local.query_filter} by {host,device}
29+
avg:system.disk.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed}
3030
* 100 > ${var.disk_space_threshold_critical}
3131
EOQ
3232

@@ -39,23 +39,23 @@ resource "datadog_monitor" "disk_space" {
3939
resource "datadog_monitor" "disk_space_forecast" {
4040
count = var.disk_space_forecast_enabled ? 1 : 0
4141

42-
name = join("", [local.title_prefix, "Disk Space Forecast - {{host.name}}", local.title_suffix])
43-
message = local.query_alert_base_message
42+
name = join("", [local.title_prefix, "Disk Space Forecast - {{name.name}}", local.title_suffix])
43+
include_tags = false
44+
message = var.disk_space_forecast_use_message ? local.query_alert_base_message : ""
4445
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
4546
type = "query alert"
4647

4748
evaluation_delay = var.evaluation_delay
4849
new_group_delay = var.new_group_delay
4950
notify_audit = false
5051
timeout_h = var.timeout_h
51-
include_tags = true
5252
require_full_window = true
5353
notify_no_data = false
5454
renotify_interval = 0
5555

5656
query = <<EOQ
5757
${var.disk_space_forecast_time_aggregator}(${var.disk_space_forecast_timeframe}):
58-
forecast(avg:system.disk.in_use${local.query_filter} by {host,device} * 100,
58+
forecast(avg:system.disk.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed} * 100,
5959
'${var.disk_space_forecast_algorithm}',
6060
${var.disk_space_forecast_deviations},
6161
interval='${var.disk_space_forecast_interval}',
@@ -74,14 +74,15 @@ resource "datadog_monitor" "disk_space_forecast" {
7474
resource "datadog_monitor" "disk_inodes" {
7575
count = var.disk_inodes_enabled ? 1 : 0
7676

77-
name = join("", [local.title_prefix, "Disk Inodes Usage - {{host.name}}", local.title_suffix])
78-
message = local.query_alert_base_message
77+
name = join("", [local.title_prefix, "Disk Inodes Usage - {{name.name}}", local.title_suffix])
78+
include_tags = false
79+
message = var.disk_inodes_use_message ? local.query_alert_base_message : ""
7980
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
8081
type = "query alert"
8182

8283
query = <<EOQ
8384
${var.disk_inodes_time_aggregator}(${var.disk_inodes_timeframe}):
84-
avg:system.fs.inodes.in_use${local.query_filter} by {host,device}
85+
avg:system.fs.inodes.in_use${local.query_filter} by {name,aws_account,device,env,datadog_managed}
8586
* 100 > ${var.disk_inodes_threshold_critical}
8687
EOQ
8788

@@ -90,7 +91,6 @@ resource "datadog_monitor" "disk_inodes" {
9091
notify_no_data = false
9192
notify_audit = false
9293
timeout_h = var.timeout_h
93-
include_tags = true
9494
require_full_window = true
9595

9696
monitor_thresholds {

host/disk/variables.tf

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,12 @@ variable "disk_space_threshold_critical" {
4646
default = 90
4747
}
4848

49+
variable "disk_space_use_message" {
50+
description = "Flag to enable Free diskspace alerting"
51+
type = string
52+
default = "true"
53+
}
54+
4955
########################################
5056
# Disk Space Forecast
5157
########################################
@@ -115,6 +121,12 @@ variable "disk_space_forecast_threshold_critical" {
115121
default = 80
116122
}
117123

124+
variable "disk_space_forecast_use_message" {
125+
description = "Flag to enable Free diskspace forecast alerting"
126+
type = string
127+
default = "false"
128+
}
129+
118130
########################################
119131
# Disk Inodes
120132
########################################
@@ -147,3 +159,9 @@ variable "disk_inodes_threshold_critical" {
147159
type = number
148160
default = 95
149161
}
162+
163+
variable "disk_inodes_use_message" {
164+
description = "Flag to enable Free disk inodes alerting"
165+
type = string
166+
default = "true"
167+
}

host/memory/main.tf

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,23 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "memory" {
1212
count = var.memory_enabled ? 1 : 0
1313

14-
name = join("", [local.title_prefix, "Usable Memory - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
14+
name = join("", [local.title_prefix, "Usable Memory - {{name.name}}", local.title_suffix])
15+
include_tags = false
16+
message = var.memory_use_message ? local.query_alert_base_message : ""
1617
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1718
type = "query alert"
1819

1920
query = <<EOQ
2021
${var.memory_time_aggregator}(${var.memory_timeframe}):
21-
avg:system.mem.usable${local.query_filter} by {host} /
22-
avg:system.mem.total${local.query_filter} by {host} * 100
22+
avg:system.mem.usable${local.query_filter} by {name,aws_account,env,datadog_managed} /
23+
avg:system.mem.total${local.query_filter} by {name,aws_account,env,datadog_managed} * 100
2324
< ${var.memory_threshold_critical}
2425
EOQ
2526

@@ -29,7 +30,6 @@ resource "datadog_monitor" "memory" {
2930
renotify_interval = 0
3031
notify_audit = false
3132
timeout_h = var.timeout_h
32-
include_tags = true
3333
require_full_window = true
3434

3535
monitor_thresholds {

host/memory/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,9 @@ variable "memory_threshold_critical" {
4545
type = number
4646
default = 5
4747
}
48+
49+
variable "memory_use_message" {
50+
description = "Flag to enable Free memory alerting"
51+
type = string
52+
default = "true"
53+
}

host/process/main.tf

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ locals {
44
monitor_warn_default_priority = null
55
monitor_nodata_default_priority = null
66

7-
title_prefix = "${var.title_prefix == null ? "" : "[${var.title_prefix}]"}[${var.env}] "
7+
title_prefix = var.title_prefix == null ? "" : "[${var.title_prefix}]"
88
title_suffix = var.title_suffix == null ? "" : " (${var.title_suffix})"
99
}
1010

1111
resource "datadog_monitor" "process_alert" {
1212
count = var.process_alert_enabled ? 1 : 0
1313

1414
name = join("", [local.title_prefix, "Process Alert - {{host.name}}", local.title_suffix])
15-
message = local.query_alert_base_message
15+
message = var.process_alert_use_message ? local.query_alert_base_message : ""
1616
tags = concat(local.common_tags, var.base_tags, var.additional_tags)
1717
type = "process alert"
1818

@@ -21,7 +21,7 @@ resource "datadog_monitor" "process_alert" {
2121
renotify_interval = 0
2222
notify_audit = false
2323
timeout_h = var.timeout_h
24-
include_tags = true
24+
include_tags = false
2525
require_full_window = true
2626

2727
query = <<EOQ

host/process/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,9 @@ variable "process_alert_operator" {
5151
type = string
5252
default = "<"
5353
}
54+
55+
variable "process_alert_use_message" {
56+
description = "Flag to enable Process Check alerting"
57+
type = string
58+
default = "true"
59+
}

0 commit comments

Comments
 (0)