
Commit 03da12d

Merge pull request #32 from thomasjpfan/compound_alerts
Adds compound alerts
2 parents 2888083 + e3d6565 commit 03da12d

File tree: 3 files changed (+231, -15 lines)


docs/usage.md

Lines changed: 6 additions & 0 deletions
```diff
@@ -62,6 +62,12 @@ Please visit [Alerting Overview](https://prometheus.io/docs/alerting/overview/)
 !!! note
     I hope that the number of shortcuts will grow with time thanks to community contributions. Please create [an issue](https://github.com/vfarcic/docker-flow-monitor/issues) with the `alertIf` statement and the suggested shortcut and I'll add it to the code as soon as possible.
 
+### AlertIf Logical Operators
+
+The logical operators `and`, `unless`, and `or` can be used in combination with AlertIf Parameter Shortcuts. For example, to create an alert that triggers when response time is low unless it is also high, set `alertIf=@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`. This alert prevents `@resp_time_below` from triggering while `@resp_time_above` is triggering. The `summary` annotations of the two shortcuts are joined with the `unless` operator: "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1". When logical operators are used, there are no default alert labels; set them explicitly with the `alertLabels` query parameter.
+
+More information on the logical operators can be found in Prometheus's querying [documentation](https://prometheus.io/docs/prometheus/latest/querying/operators/#logical-set-binary-operators).
+
 ## Remove
 
 !!! tip
```
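As an illustration of the new syntax, here is a minimal Go sketch of a client that registers a compound alert through the reconfigure endpoint exercised in `server/server_test.go`. The monitor address, the `alertFor` value, and the chosen label values are placeholders, not part of this commit; with this `alertIf`, the generated expression joins the two expanded shortcuts with `unless`, and their `summary` annotations are joined the same way.

```go
package main

import (
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	// Query parameters mirror the ones used in server_test.go.
	q := url.Values{}
	q.Set("serviceName", "my-service")
	q.Set("alertName", "my-alert")
	// `unless` suppresses the first alert while the second one is firing.
	q.Set("alertIf", "@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99")
	q.Set("alertFor", "30s") // placeholder duration
	q.Set("replicas", "3")
	// Compound alerts get no default labels, so set them explicitly.
	q.Set("alertLabels", "receiver=system,service=my-service")

	// "http://monitor:8080" is a placeholder for wherever the monitor is reachable.
	addr := "http://monitor:8080/v1/docker-flow-monitor?" + q.Encode()
	resp, err := http.Get(addr)
	if err != nil {
		fmt.Println("reconfigure failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```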

server/server.go

Lines changed: 111 additions & 15 deletions
@@ -324,40 +324,136 @@ var alertIfShortcutData = map[string]alertIfShortcut{

The hunk replaces the inline shortcut expansion in `formatAlert` with three helpers: `formatSingleAlert`, `formatCompoundAlert`, and `splitCompoundOp`. The resulting code:

```go
func (s *serve) formatAlert(alert *prometheus.Alert) {
	alert.AlertNameFormatted = s.getNameFormatted(fmt.Sprintf("%s_%s", alert.ServiceName, alert.AlertName))
	if !strings.HasPrefix(alert.AlertIf, "@") {
		return
	}

	_, bOp, _ := splitCompoundOp(alert.AlertIf)
	if len(bOp) > 0 {
		formatCompoundAlert(alert)
	} else {
		formatSingleAlert(alert)
	}
}

func formatSingleAlert(alert *prometheus.Alert) {
	value := ""
	alertSplit := strings.Split(alert.AlertIf, ":")
	shortcut := alertSplit[0]

	if len(alertSplit) > 1 {
		value = alertSplit[1]
	}

	data, ok := alertIfShortcutData[shortcut]
	if !ok {
		return
	}

	alert.AlertIf = replaceTags(data.expanded, alert, value)

	if alert.AlertAnnotations == nil {
		alert.AlertAnnotations = map[string]string{}
	}
	for k, v := range data.annotations {
		if _, ok := alert.AlertAnnotations[k]; !ok {
			alert.AlertAnnotations[k] = replaceTags(v, alert, value)
		}
	}

	if alert.AlertLabels == nil {
		alert.AlertLabels = map[string]string{}
	}
	for k, v := range data.labels {
		if _, ok := alert.AlertLabels[k]; !ok {
			alert.AlertLabels[k] = replaceTags(v, alert, value)
		}
	}
}

func formatCompoundAlert(alert *prometheus.Alert) {
	alertIfStr := alert.AlertIf
	alertAnnotations := map[string]string{}
	immutableAnnotations := map[string]struct{}{}

	// record annotation keys that were set explicitly so they are not overwritten
	if alert.AlertAnnotations != nil {
		for k := range alert.AlertAnnotations {
			immutableAnnotations[k] = struct{}{}
		}
	}

	var alertIfFormattedBuffer bytes.Buffer

	currentAlert, bOp, alertIfStr := splitCompoundOp(alertIfStr)

	for len(currentAlert) > 0 {
		value := ""
		alertSplit := strings.Split(currentAlert, ":")
		shortcut := alertSplit[0]

		if len(alertSplit) > 1 {
			value = alertSplit[1]
		}
		data, ok := alertIfShortcutData[shortcut]
		if !ok {
			return
		}

		alertIfFormattedBuffer.WriteString(replaceTags(data.expanded, alert, value))
		if len(bOp) > 0 {
			alertIfFormattedBuffer.WriteString(fmt.Sprintf(" %s ", bOp))
		}

		for k, v := range data.annotations {
			if _, ok := immutableAnnotations[k]; ok {
				continue
			}
			alertAnnotations[k] += replaceTags(v, alert, value)
			if len(bOp) > 0 {
				alertAnnotations[k] += fmt.Sprintf(" %s ", bOp)
			}
		}
		currentAlert, bOp, alertIfStr = splitCompoundOp(alertIfStr)
	}

	alert.AlertIf = alertIfFormattedBuffer.String()

	if alert.AlertAnnotations == nil {
		alert.AlertAnnotations = map[string]string{}
	}

	for k, v := range alertAnnotations {
		if _, ok := immutableAnnotations[k]; ok {
			continue
		}
		alert.AlertAnnotations[k] = v
	}
}

// splitCompoundOp splits a string into three pieces if it includes _unless_,
// _and_, or _or_. For example, hello_and_world_or_earth returns [hello, and, world_or_earth].
func splitCompoundOp(s string) (string, string, string) {
	binaryOps := []string{"unless", "and", "or"}

	minIdx := len(s)
	minOp := ""
	for _, bOp := range binaryOps {
		idx := strings.Index(s, fmt.Sprintf("_%s_", bOp))
		if idx != -1 && idx < minIdx {
			minIdx = idx
			minOp = bOp
		}
	}

	if len(minOp) > 0 {
		return s[:minIdx], minOp, s[minIdx+len(minOp)+2:]
	}
	return s, "", ""
}

func replaceTags(tag string, alert *prometheus.Alert, value string) string { // unchanged by this hunk
```
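To make the splitting behaviour concrete, here is a small sketch (hypothetical code; it would have to live inside the `server` package because `splitCompoundOp` is unexported, and it relies on the package's existing `fmt` import) that peels a compound expression apart one operand at a time, the same way `formatCompoundAlert` does before expanding each shortcut:

```go
// walkCompound is a hypothetical helper for illustration only.
// It repeatedly calls splitCompoundOp, printing one operand and the
// operator that follows it on each iteration.
func walkCompound(alertIf string) {
	operand, op, rest := splitCompoundOp(alertIf)
	for len(operand) > 0 {
		fmt.Printf("operand=%q op=%q rest=%q\n", operand, op, rest)
		operand, op, rest = splitCompoundOp(rest)
	}
}

// walkCompound("hello_and_world_or_earth") would print:
//   operand="hello" op="and" rest="world_or_earth"
//   operand="world" op="or" rest="earth"
//   operand="earth" op="" rest=""
```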

server/server_test.go

Lines changed: 114 additions & 0 deletions
The first hunk adds `strings` to the test imports:

```diff
@@ -7,6 +7,7 @@ import (
 	"net/http/httptest"
 	"net/url"
 	"os"
+	"strings"
 	"testing"
 	"time"
```

The second hunk adds two tests covering compound operators:

@@ -310,6 +311,119 @@ func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts() {

```go
func (s *ServerTestSuite) Test_ReconfigureHandler_ExpandsShortcuts_CompoundOps() {
	testData := []struct {
		expected    string
		shortcut    string
		annotations map[string]string
		labels      map[string]string
	}{
		{
			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
			`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
			map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
			map[string]string{},
		},
		{
			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
			`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
			map[string]string{"summary": "Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
			map[string]string{"receiver": "system", "service": "my-service", "type": "service"},
		},
		{
			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 and container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
			`@resp_time_above:0.1,5m,0.99_and_@service_mem_limit:0.8`,
			map[string]string{"summary": "Response time of the service my-service is above 0.1 and Memory of the service my-service is over 0.8"},
			map[string]string{"receiver": "system", "service": "my-service"},
		},
		{
			`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99 or container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8`,
			`@resp_time_above:0.1,5m,0.99_or_@service_mem_limit:0.8`,
			map[string]string{"summary": "Response time of the service my-service is above 0.1 or Memory of the service my-service is over 0.8"},
			map[string]string{"receiver": "system"},
		},
		{
			`container_memory_usage_bytes{container_label_com_docker_swarm_service_name="my-service"}/container_spec_memory_limit_bytes{container_label_com_docker_swarm_service_name="my-service"} > 0.8 and sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
			`@service_mem_limit:0.8_and_@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
			map[string]string{"summary": "Memory of the service my-service is over 0.8 and Response time of the service my-service is below 0.025 unless Response time of the service my-service is above 0.1"},
			map[string]string{"receiver": "system"},
		},
	}

	for _, data := range testData {
		expected := prometheus.Alert{
			AlertAnnotations:   data.annotations,
			AlertFor:           "my-for",
			AlertIf:            data.expected,
			AlertLabels:        data.labels,
			AlertName:          "my-alert",
			AlertNameFormatted: "myservice_myalert",
			ServiceName:        "my-service",
			Replicas:           3,
		}
		rwMock := ResponseWriterMock{}
		alertQueries := []string{}
		for k, v := range data.labels {
			alertQueries = append(alertQueries, fmt.Sprintf("%s=%s", k, v))
		}
		alertQueryStr := strings.Join(alertQueries, ",")
		addr := fmt.Sprintf(
			"/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3",
			expected.ServiceName,
			expected.AlertName,
			data.shortcut,
			expected.AlertFor,
		)
		if len(alertQueries) > 0 {
			addr += fmt.Sprintf("&alertLabels=%s", alertQueryStr)
		}
		req, _ := http.NewRequest("GET", addr, nil)

		serve := New()
		serve.ReconfigureHandler(rwMock, req)

		s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
	}
}

func (s *ServerTestSuite) Test_ReconfigureHandler_DoesNotExpandAnnotations_WhenTheyAreAlreadySet_CompoundOps() {
	testData := struct {
		expected    string
		shortcut    string
		annotations map[string]string
		labels      map[string]string
	}{
		`sum(rate(http_server_resp_time_bucket{job="my-service", le="0.025"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) > 0.75 unless sum(rate(http_server_resp_time_bucket{job="my-service", le="0.1"}[5m])) / sum(rate(http_server_resp_time_count{job="my-service"}[5m])) < 0.99`,
		`@resp_time_below:0.025,5m,0.75_unless_@resp_time_above:0.1,5m,0.99`,
		map[string]string{"summary": "not-again"},
		map[string]string{"receiver": "system", "service": "ugly-service"},
	}
	expected := prometheus.Alert{
		AlertAnnotations:   testData.annotations,
		AlertFor:           "my-for",
		AlertIf:            testData.expected,
		AlertLabels:        testData.labels,
		AlertName:          "my-alert",
		AlertNameFormatted: "myservice_myalert",
		ServiceName:        "my-service",
		Replicas:           3,
	}
	rwMock := ResponseWriterMock{}
	addr := fmt.Sprintf(
		"/v1/docker-flow-monitor?serviceName=%s&alertName=%s&alertIf=%s&alertFor=%s&replicas=3&alertAnnotations=summary=not-again&alertLabels=service=ugly-service,receiver=system",
		expected.ServiceName,
		expected.AlertName,
		testData.shortcut,
		expected.AlertFor,
	)
	req, _ := http.NewRequest("GET", addr, nil)

	serve := New()
	serve.ReconfigureHandler(rwMock, req)

	s.Equal(expected, serve.alerts[expected.AlertNameFormatted])
}
```

The hunk's trailing context is the start of the existing `Test_ReconfigureHandler_DoesNotExpandAnnotationsAndLabels_WhenTheyAreAlreadySet` test, which is unchanged.