Skip to content

Commit 05a58f7

Browse files
authored
Merge pull request #779 from grafana/hjet/aggregate_alerts
Add cluster label to aggregations in resource alerts
2 parents 7b559e8 + 5e0192e commit 05a58f7

File tree

1 file changed

+84
-27
lines changed

1 file changed

+84
-27
lines changed

alerts/resource_alerts.libsonnet

Lines changed: 84 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -24,70 +24,127 @@
2424
rules: [
2525
{
2626
alert: 'KubeCPUOvercommit',
27-
expr: |||
28-
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
29-
and
30-
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
31-
||| % $._config,
3227
labels: {
3328
severity: 'warning',
3429
},
3530
annotations: {
36-
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.',
3731
summary: 'Cluster has overcommitted CPU resource requests.',
3832
},
3933
'for': '10m',
40-
},
41-
{
42-
alert: 'KubeMemoryOvercommit',
34+
} +
35+
if $._config.showMultiCluster then {
36+
expr: |||
37+
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
38+
and
39+
(sum(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="cpu"}) by (%(clusterLabel)s)) > 0
40+
||| % $._config,
41+
annotations+: {
42+
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
43+
},
44+
} else {
4345
expr: |||
44-
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
46+
sum(namespace_cpu:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
4547
and
46-
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
48+
(sum(kube_node_status_allocatable{resource="cpu"}) - max(kube_node_status_allocatable{resource="cpu"})) > 0
4749
||| % $._config,
50+
annotations+: {
51+
description: 'Cluster has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.' % $._config,
52+
},
53+
},
54+
{
55+
alert: 'KubeMemoryOvercommit',
4856
labels: {
4957
severity: 'warning',
5058
},
5159
annotations: {
52-
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
5360
summary: 'Cluster has overcommitted memory resource requests.',
5461
},
5562
'for': '10m',
56-
},
57-
{
58-
alert: 'KubeCPUQuotaOvercommit',
63+
} +
64+
if $._config.showMultiCluster then {
5965
expr: |||
60-
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
61-
/
62-
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
63-
> %(namespaceOvercommitFactor)s
66+
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) by (%(clusterLabel)s) - (sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
67+
and
68+
(sum(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s) - max(kube_node_status_allocatable{resource="memory"}) by (%(clusterLabel)s)) > 0
6469
||| % $._config,
70+
annotations+: {
71+
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.' % $._config,
72+
},
73+
} else
74+
{
75+
expr: |||
76+
sum(namespace_memory:kube_pod_container_resource_requests:sum{%(ignoringOverprovisionedWorkloadSelector)s}) - (sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
77+
and
78+
(sum(kube_node_status_allocatable{resource="memory"}) - max(kube_node_status_allocatable{resource="memory"})) > 0
79+
||| % $._config,
80+
annotations+: {
81+
description: 'Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure.',
82+
},
83+
},
84+
{
85+
alert: 'KubeCPUQuotaOvercommit',
6586
labels: {
6687
severity: 'warning',
6788
},
6889
annotations: {
69-
description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
7090
summary: 'Cluster has overcommitted CPU resource requests.',
7191
},
7292
'for': '5m',
73-
},
74-
{
75-
alert: 'KubeMemoryQuotaOvercommit',
93+
} +
94+
if $._config.showMultiCluster then {
7695
expr: |||
77-
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
96+
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"})) by (%(clusterLabel)s)
7897
/
79-
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
98+
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
8099
> %(namespaceOvercommitFactor)s
81100
||| % $._config,
101+
annotations+: {
102+
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted CPU resource requests for Namespaces.' % $._config,
103+
},
104+
} else
105+
{
106+
expr: |||
107+
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(cpu|requests.cpu)"}))
108+
/
109+
sum(kube_node_status_allocatable{resource="cpu", %(kubeStateMetricsSelector)s})
110+
> %(namespaceOvercommitFactor)s
111+
||| % $._config,
112+
annotations+: {
113+
description: 'Cluster has overcommitted CPU resource requests for Namespaces.',
114+
},
115+
},
116+
{
117+
alert: 'KubeMemoryQuotaOvercommit',
82118
labels: {
83119
severity: 'warning',
84120
},
85121
annotations: {
86-
description: 'Cluster has overcommitted memory resource requests for Namespaces.',
87122
summary: 'Cluster has overcommitted memory resource requests.',
88123
},
89124
'for': '5m',
90-
},
125+
} +
126+
if $._config.showMultiCluster then {
127+
expr: |||
128+
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"})) by (%(clusterLabel)s)
129+
/
130+
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s}) by (%(clusterLabel)s)
131+
> %(namespaceOvercommitFactor)s
132+
||| % $._config,
133+
annotations+: {
134+
description: 'Cluster {{ $labels.%(clusterLabel)s }} has overcommitted memory resource requests for Namespaces.' % $._config,
135+
},
136+
} else
137+
{
138+
expr: |||
139+
sum(min without(resource) (kube_resourcequota{%(prefixedNamespaceSelector)s%(kubeStateMetricsSelector)s, type="hard", resource=~"(memory|requests.memory)"}))
140+
/
141+
sum(kube_node_status_allocatable{resource="memory", %(kubeStateMetricsSelector)s})
142+
> %(namespaceOvercommitFactor)s
143+
||| % $._config,
144+
annotations+: {
145+
description: 'Cluster has overcommitted memory resource requests for Namespaces.',
146+
},
147+
},
91148
{
92149
alert: 'KubeQuotaAlmostFull',
93150
expr: |||

0 commit comments

Comments (0)