
Commit 33541e8

health mode
1 parent 2ac0ce2 commit 33541e8

File tree

7 files changed (+517, -0 lines)


api/flowcollector/v1beta2/flowcollector_alert_types.go

Lines changed: 3 additions & 0 deletions

@@ -12,6 +12,7 @@ import (
 
 type AlertTemplate string
 type AlertGroupBy string
+type HealthMode string
 
 const (
     AlertNoFlows AlertTemplate = "NetObservNoFlows"
@@ -28,6 +29,8 @@ const (
     GroupByNode      AlertGroupBy = "Node"
     GroupByNamespace AlertGroupBy = "Namespace"
     GroupByWorkload  AlertGroupBy = "Workload"
+    HealthModeAlerts         HealthMode = "alerts"
+    HealthModeRecordingRules HealthMode = "recording-rules"
 )
 
 type FLPAlert struct {

api/flowcollector/v1beta2/flowcollector_types.go

Lines changed: 11 additions & 0 deletions

@@ -590,6 +590,17 @@ type FLPMetrics struct {
     // More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md
     // +optional
     Alerts *[]FLPAlert `json:"alerts"`
+
+    // `healthMode` defines how to expose network health information.
+    // Possible values are `alerts` (default) or `recording-rules`.
+    // - `alerts`: Generate Prometheus alerts that fire when thresholds are exceeded (current behavior).
+    // - `recording-rules`: Generate Prometheus recording rules that pre-compute health metrics for passive consumption.
+    // Recording rules avoid alert fatigue and are useful for dashboard-based health monitoring.
+    // This is currently an experimental feature behind a feature gate. To enable, edit `spec.processor.advanced.env` by adding `EXPERIMENTAL_ALERTS_HEALTH` set to `true`.
+    // +kubebuilder:validation:Enum:="alerts";"recording-rules"
+    // +kubebuilder:default:="alerts"
+    // +optional
+    HealthMode string `json:"healthMode,omitempty"`
 }
 
 type FLPLogTypes string
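
Note (not part of the commit): a minimal sketch of switching a FlowCollector spec to the new mode from Go code. Only the HealthMode field, the HealthModeRecordingRules constant, and the Processor.Metrics path (used by the builder below) come from this diff; the enableRecordingRules helper name is illustrative, and the EXPERIMENTAL_ALERTS_HEALTH gate mentioned in the field comment must still be set under spec.processor.advanced.env for any rules to be generated.

package example

import (
    flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2"
)

// enableRecordingRules switches an existing spec to the recording-rules health mode.
// The feature stays gated: EXPERIMENTAL_ALERTS_HEALTH=true must also be present
// under spec.processor.advanced.env (wiring not shown here).
func enableRecordingRules(spec *flowslatest.FlowCollectorSpec) {
    spec.Processor.Metrics.HealthMode = string(flowslatest.HealthModeRecordingRules)
}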

config/crd/bases/flows.netobserv.io_flowcollectors.yaml

Lines changed: 13 additions & 0 deletions

@@ -5427,6 +5427,19 @@ spec:
                 items:
                   type: string
                 type: array
+              healthMode:
+                default: alerts
+                description: |-
+                  `healthMode` defines how to expose network health information.
+                  Possible values are `alerts` (default) or `recording-rules`.
+                  - `alerts`: Generate Prometheus alerts that fire when thresholds are exceeded (current behavior).
+                  - `recording-rules`: Generate Prometheus recording rules that pre-compute health metrics for passive consumption.
+                  Recording rules avoid alert fatigue and are useful for dashboard-based health monitoring.
+                  This is currently an experimental feature behind a feature gate. To enable, edit `spec.processor.advanced.env` by adding `EXPERIMENTAL_ALERTS_HEALTH` set to `true`.
+                enum:
+                - alerts
+                - recording-rules
+                type: string
               includeList:
                 description: |-
                   `includeList` is a list of metric names to specify which ones to generate.

docs/FlowCollector.md

Lines changed: 15 additions & 0 deletions

@@ -11485,6 +11485,21 @@ Possible values are: `NetObservNoFlows`, `NetObservLokiError`, `PacketDropsByKer
 More information on alerts: https://github.com/netobserv/network-observability-operator/blob/main/docs/Alerts.md<br/>
 </td>
 <td>false</td>
+</tr><tr>
+<td><b>healthMode</b></td>
+<td>enum</td>
+<td>
+`healthMode` defines how to expose network health information.
+Possible values are `alerts` (default) or `recording-rules`.
+- `alerts`: Generate Prometheus alerts that fire when thresholds are exceeded (current behavior).
+- `recording-rules`: Generate Prometheus recording rules that pre-compute health metrics for passive consumption.
+Recording rules avoid alert fatigue and are useful for dashboard-based health monitoring.
+This is currently an experimental feature behind a feature gate. To enable, edit `spec.processor.advanced.env` by adding `EXPERIMENTAL_ALERTS_HEALTH` set to `true`.<br/>
+<br/>
+<i>Enum</i>: alerts, recording-rules<br/>
+<i>Default</i>: alerts<br/>
+</td>
+<td>false</td>
 </tr><tr>
 <td><b>includeList</b></td>
 <td>[]enum</td>

internal/pkg/metrics/alerts/builder.go

Lines changed: 49 additions & 0 deletions

@@ -33,8 +33,23 @@ type ruleBuilder struct {
     duration monitoringv1.Duration
 }
 
+// BuildRules is the main entry point that decides whether to build alerts or recording rules
+// based on the healthMode configuration
 func BuildRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule {
     log := log.FromContext(ctx)
+
+    if fc.Processor.Metrics.HealthMode == string(flowslatest.HealthModeRecordingRules) {
+        log.Info("Building recording rules for health monitoring")
+        return BuildRecordingRules(ctx, fc)
+    }
+
+    log.Info("Building alerts for health monitoring")
+    return BuildAlertRules(ctx, fc)
+}
+
+// BuildAlertRules builds Prometheus alert rules for health monitoring
+func BuildAlertRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule {
+    log := log.FromContext(ctx)
     rules := []monitoringv1.Rule{}
 
     if fc.HasExperimentalAlertsHealth() {
@@ -66,6 +81,40 @@ func BuildRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule {
     return rules
 }
 
+// BuildRecordingRules builds Prometheus recording rules for health monitoring
+func BuildRecordingRules(ctx context.Context, fc *flowslatest.FlowCollectorSpec) []monitoringv1.Rule {
+    log := log.FromContext(ctx)
+    rules := []monitoringv1.Rule{}
+
+    if fc.HasExperimentalAlertsHealth() {
+        alerts := fc.GetFLPAlerts()
+        metrics := fc.GetIncludeList()
+        for _, alert := range alerts {
+            if ok, _ := alert.IsAllowed(fc); !ok {
+                continue
+            }
+            for _, variant := range alert.Variants {
+                if r, err := convertToRecordingRules(alert.Template, &variant, metrics); err != nil {
+                    log.Error(err, "unable to configure a recording rule")
+                } else if len(r) > 0 {
+                    rules = append(rules, r...)
+                }
+            }
+        }
+    }
+
+    if !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertNoFlows) {
+        r := recordingNoFlows()
+        rules = append(rules, *r)
+    }
+    if !slices.Contains(fc.Processor.Metrics.DisableAlerts, flowslatest.AlertLokiError) {
+        r := recordingLokiError()
+        rules = append(rules, *r)
+    }
+
+    return rules
+}
+
 func convertToRules(template flowslatest.AlertTemplate, alert *flowslatest.AlertVariant, enabledMetrics []string) ([]monitoringv1.Rule, error) {
     var rules []monitoringv1.Rule
     var upperThreshold string
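
Note (usage sketch, not part of the commit): BuildRules returns a flat []monitoringv1.Rule whichever mode is selected, so a caller such as the monitoring reconciler can drop the result into a PrometheusRule group without caring whether the entries are alerting or recording rules. The healthRuleGroup helper and the "netobserv-health" group name below are illustrative assumptions.

package example

import (
    "context"

    monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"

    flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2"
    "github.com/netobserv/network-observability-operator/internal/pkg/metrics/alerts"
)

// healthRuleGroup wraps the generated rules (alerting or recording, depending on
// spec.processor.metrics.healthMode) into a single PrometheusRule group.
func healthRuleGroup(ctx context.Context, fc *flowslatest.FlowCollectorSpec) monitoringv1.RuleGroup {
    return monitoringv1.RuleGroup{
        Name:  "netobserv-health", // illustrative group name
        Rules: alerts.BuildRules(ctx, fc),
    }
}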
Lines changed: 257 additions & 0 deletions

@@ -0,0 +1,257 @@
+package alerts
+
+import (
+    "fmt"
+    "strings"
+
+    flowslatest "github.com/netobserv/network-observability-operator/api/flowcollector/v1beta2"
+    monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
+    "k8s.io/apimachinery/pkg/util/intstr"
+)
+
+// convertToRecordingRules converts alert configuration to recording rules
+func convertToRecordingRules(template flowslatest.AlertTemplate, alert *flowslatest.AlertVariant, enabledMetrics []string) ([]monitoringv1.Rule, error) {
+    var rules []monitoringv1.Rule
+    sides := []srcOrDst{asSource, asDest}
+    if alert.GroupBy == "" {
+        // No side for global group
+        sides = []srcOrDst{""}
+    }
+
+    // For recording rules, we create one rule per side (not per severity)
+    for _, side := range sides {
+        rb := ruleBuilder{
+            template:       template,
+            alert:          alert,
+            enabledMetrics: enabledMetrics,
+            side:           side,
+            duration:       monitoringv1.Duration("5m"),
+        }
+        if r, err := rb.convertToRecordingRule(); err != nil {
+            return nil, err
+        } else if r != nil {
+            rules = append(rules, *r)
+        }
+    }
+    return rules, nil
+}
+
+func (rb *ruleBuilder) convertToRecordingRule() (*monitoringv1.Rule, error) {
+    switch rb.template {
+    case flowslatest.AlertPacketDropsByKernel:
+        return rb.kernelDropsRecording()
+    case flowslatest.AlertIPsecErrors:
+        return rb.ipsecErrorsRecording()
+    case flowslatest.AlertDNSErrors:
+        return rb.dnsErrorsRecording()
+    case flowslatest.AlertNetpolDenied:
+        return rb.netpolDeniedRecording()
+    case flowslatest.AlertLatencyHighTrend:
+        return rb.latencyTrendRecording()
+    case flowslatest.AlertPacketDropsByDevice:
+        return rb.deviceDropsRecording()
+    case flowslatest.AlertExternalEgressHighTrend, flowslatest.AlertExternalIngressHighTrend, flowslatest.AlertCrossAZ:
+        // TODO: implement these
+        return nil, nil
+    case flowslatest.AlertLokiError, flowslatest.AlertNoFlows:
+        // These are handled separately in BuildRecordingRules
+        return nil, nil
+    }
+    return nil, fmt.Errorf("unknown recording rule template: %s", rb.template)
+}
+
+func (rb *ruleBuilder) buildRecordingRuleName() string {
+    // Format: netobserv:health:<template>:<groupby>:<side>:rate5m
+    // Example: netobserv:health:packet_drops_by_kernel:namespace:src:rate5m
+
+    // Convert the CamelCase template name to snake_case. Note: camelToSnake
+    // lowercases its result, so the template must not be lowercased beforehand
+    // or the word boundaries are lost.
+    templateSnake := camelToSnake(string(rb.template))
+
+    var parts []string
+    parts = append(parts, "netobserv", "health", templateSnake)
+
+    if rb.alert.GroupBy != "" {
+        parts = append(parts, strings.ToLower(string(rb.alert.GroupBy)))
+    }
+
+    if rb.side != "" {
+        parts = append(parts, strings.ToLower(string(rb.side)))
+    }
+
+    parts = append(parts, "rate5m")
+
+    return strings.Join(parts, ":")
+}
+
+func (rb *ruleBuilder) buildRecordingRuleLabels() map[string]string {
+    labels := map[string]string{
+        "netobserv":       "health",
+        "health_template": string(rb.template),
+    }
+
+    if rb.alert.GroupBy != "" {
+        labels["health_groupby"] = string(rb.alert.GroupBy)
+    }
+
+    if rb.side != "" {
+        labels["health_side"] = string(rb.side)
+    }
+
+    return labels
+}
+
+func (rb *ruleBuilder) kernelDropsRecording() (*monitoringv1.Rule, error) {
+    metric, totalMetric := rb.getMetricsForAlert()
+    filter := rb.buildLabelFilter("")
+    metricsRate := promQLRateFromMetric(metric, "", filter, "5m", "")
+    totalRate := promQLRateFromMetric(totalMetric, "", filter, "5m", "")
+    metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
+    totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
+
+    // Recording rule: compute the percentage without threshold comparison
+    promql := fmt.Sprintf("100 * (%s) / (%s)", metricsSumBy, totalSumBy)
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func (rb *ruleBuilder) deviceDropsRecording() (*monitoringv1.Rule, error) {
+    // No "side" consideration on netdev metrics, so keep only 1 call from the two of them
+    if rb.side == asDest {
+        return nil, nil
+    }
+
+    var byLabels string
+    switch rb.alert.GroupBy {
+    case flowslatest.GroupByNode:
+        byLabels = " by (instance)"
+    case flowslatest.GroupByNamespace:
+        return nil, fmt.Errorf("PacketDropsByDevice recording rule does not support grouping per namespace")
+    case flowslatest.GroupByWorkload:
+        return nil, fmt.Errorf("PacketDropsByDevice recording rule does not support grouping per workload")
+    }
+
+    promql := fmt.Sprintf(
+        "100 * (sum(rate(node_network_receive_drop_total[5m]))%s + sum(rate(node_network_transmit_drop_total[5m]))%s) / (sum(rate(node_network_receive_packets_total[5m]))%s + sum(rate(node_network_transmit_packets_total[5m]))%s)",
+        byLabels, byLabels, byLabels, byLabels,
+    )
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func (rb *ruleBuilder) ipsecErrorsRecording() (*monitoringv1.Rule, error) {
+    metric, totalMetric := rb.getMetricsForAlert()
+    filter := rb.buildLabelFilter("")
+    metricsRate := promQLRateFromMetric(metric, "", filter, "5m", "")
+    totalRate := promQLRateFromMetric(totalMetric, "", filter, "5m", "")
+    metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
+    totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
+    promql := fmt.Sprintf("100 * (%s) / (%s)", metricsSumBy, totalSumBy)
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func (rb *ruleBuilder) dnsErrorsRecording() (*monitoringv1.Rule, error) {
+    // DNS errors are in return traffic only
+    if rb.side == asSource {
+        return nil, nil
+    }
+
+    metric, totalMetric := rb.getMetricsForAlert()
+    metricsFilter := rb.buildLabelFilter(`DnsFlagsResponseCode!="NoError"`)
+    totalFilter := rb.buildLabelFilter("")
+    metricsRate := promQLRateFromMetric(metric, "_count", metricsFilter, "5m", "")
+    totalRate := promQLRateFromMetric(totalMetric, "_count", totalFilter, "5m", "")
+    metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
+    totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
+    promql := fmt.Sprintf("100 * (%s) / (%s)", metricsSumBy, totalSumBy)
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func (rb *ruleBuilder) netpolDeniedRecording() (*monitoringv1.Rule, error) {
+    metric, totalMetric := rb.getMetricsForAlert()
+    metricsFilter := rb.buildLabelFilter(`action="drop"`)
+    totalFilter := rb.buildLabelFilter("")
+    metricsRate := promQLRateFromMetric(metric, "", metricsFilter, "5m", "")
+    totalRate := promQLRateFromMetric(totalMetric, "", totalFilter, "5m", "")
+    metricsSumBy := sumBy(metricsRate, rb.alert.GroupBy, rb.side, "")
+    totalSumBy := sumBy(totalRate, rb.alert.GroupBy, rb.side, "")
+    promql := fmt.Sprintf("100 * (%s) / (%s)", metricsSumBy, totalSumBy)
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func (rb *ruleBuilder) latencyTrendRecording() (*monitoringv1.Rule, error) {
+    offset, duration := rb.alert.GetTrendParams()
+
+    metric, baseline := rb.getMetricsForAlert()
+    filter := rb.buildLabelFilter("")
+    metricsRate := promQLRateFromMetric(metric, "_bucket", filter, "5m", "")
+    baselineRate := promQLRateFromMetric(baseline, "_bucket", filter, duration, " offset "+offset)
+    metricQuantile := histogramQuantile(metricsRate, rb.alert.GroupBy, rb.side, "0.9")
+    baselineQuantile := histogramQuantile(baselineRate, rb.alert.GroupBy, rb.side, "0.9")
+
+    // Recording rule: compute the percentage increase without threshold comparison
+    promql := fmt.Sprintf("100 * ((%s) - (%s)) / (%s)", metricQuantile, baselineQuantile, baselineQuantile)
+
+    return &monitoringv1.Rule{
+        Record: rb.buildRecordingRuleName(),
+        Expr:   intstr.FromString(promql),
+        Labels: rb.buildRecordingRuleLabels(),
+    }, nil
+}
+
+func recordingNoFlows() *monitoringv1.Rule {
+    return &monitoringv1.Rule{
+        Record: "netobserv:health:no_flows:rate1m",
+        Expr:   intstr.FromString("sum(rate(netobserv_ingest_flows_processed[1m]))"),
+        Labels: map[string]string{
+            "netobserv":       "health",
+            "health_template": "NetObservNoFlows",
+        },
+    }
+}
+
+func recordingLokiError() *monitoringv1.Rule {
+    return &monitoringv1.Rule{
+        Record: "netobserv:health:loki_errors:rate1m",
+        Expr:   intstr.FromString("sum(rate(netobserv_loki_dropped_entries_total[1m]))"),
+        Labels: map[string]string{
+            "netobserv":       "health",
+            "health_template": "NetObservLokiError",
+        },
+    }
+}
+
+// camelToSnake converts CamelCase to snake_case
+func camelToSnake(s string) string {
+    var result strings.Builder
+    for i, r := range s {
+        if i > 0 && r >= 'A' && r <= 'Z' {
+            result.WriteRune('_')
+        }
+        result.WriteRune(r)
+    }
+    return strings.ToLower(result.String())
+}
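
Note (consumption sketch, not part of the commit): the "passive consumption" that recording rules enable amounts to querying the pre-computed series from a dashboard or backend instead of waiting for an alert to fire. A hedged example using the Prometheus Go client follows; the series name is the one shown in the buildRecordingRuleName comment and depends on the configured template, groupBy and side, and the Prometheus address is a placeholder.

package example

import (
    "context"
    "fmt"
    "time"

    "github.com/prometheus/client_golang/api"
    promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
)

// queryKernelDropRatio reads the pre-computed kernel drop percentage,
// one sample per namespace, each labeled with netobserv="health".
func queryKernelDropRatio(ctx context.Context, promURL string) error {
    client, err := api.NewClient(api.Config{Address: promURL})
    if err != nil {
        return err
    }
    result, warnings, err := promv1.NewAPI(client).Query(ctx,
        `netobserv:health:packet_drops_by_kernel:namespace:src:rate5m`, time.Now())
    if err != nil {
        return err
    }
    if len(warnings) > 0 {
        fmt.Println("query warnings:", warnings)
    }
    fmt.Println(result)
    return nil
}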
