-
Notifications
You must be signed in to change notification settings - Fork 70
155 lines (146 loc) · 6.02 KB
/
Copy pathalertingrules.yml
File metadata and controls
155 lines (146 loc) · 6.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
groups:
# Backend service health: HTTP error ratio, process restarts, authentication
# abuse signals, and scrape reachability. Rules in this group are evaluated
# every 30 seconds.
- name: trivela_backend
  interval: 30s
  rules:
  # Fires when 5xx responses make up more than 5% of all backend requests,
  # measured over a 5-minute rate window and sustained for 5 minutes.
  - alert: HighBackendErrorRate
    expr: |
      (
        sum(rate(http_requests_total{job="trivela-backend", status=~"5.."}[5m]))
        /
        sum(rate(http_requests_total{job="trivela-backend"}[5m]))
      ) > 0.05
    for: 5m
    labels:
      severity: critical
      team: backend
    annotations:
      summary: 'Trivela backend error rate above 5%'
      description: >-
        Error rate is {{ $value | humanizePercentage }} over the last 5 minutes.
        Investigate backend logs immediately.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart'
  # Fires immediately (for: 0m) when the backend's start timestamp changed
  # within the last 5 minutes, i.e. the process restarted.
  # process_start_time_seconds is a gauge, so changes() is used instead of
  # increase() (which is meant for counters). $value is therefore the number
  # of restarts seen in the window, not a timestamp.
  - alert: BackendProcessRestart
    expr: |
      changes(process_start_time_seconds{job="trivela-backend"}[5m]) > 0
    for: 0m
    labels:
      severity: warning
      team: backend
    annotations:
      summary: 'Trivela backend process restarted'
      description: >-
        The backend process restarted {{ $value }} time(s) in the last 5 minutes.
        Verify deployment or investigate crash.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart'
  # Brute-force / credential-stuffing (#588): fires when failed auth attempts
  # average more than 1 per second over 5 minutes, sustained for 5 minutes.
  - alert: AuthFailureSpike
    expr: |
      sum(rate(trivela_auth_failures_total{job="trivela-backend"}[5m])) > 1
    for: 5m
    labels:
      severity: warning
      team: backend
    annotations:
      summary: 'Spike in failed authentication attempts'
      description: >-
        Failed auth attempts are averaging {{ $value | humanize }}/s over the last
        5 minutes — possible brute-force or credential-stuffing. Check the
        trivela_auth_lockouts_total counter and backend warn logs.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#auth-brute-force-lockout'
  # Brute-force lockout actively triggering (#588): fires immediately when the
  # lockout counter grew during the last 5 minutes.
  - alert: AuthLockoutTriggered
    expr: |
      increase(trivela_auth_lockouts_total{job="trivela-backend"}[5m]) > 0
    for: 0m
    labels:
      severity: critical
      team: backend
    annotations:
      summary: 'Authentication lockout triggered'
      description: >-
        {{ $value }} brute-force lockout(s) triggered in the last 5 minutes. One or
        more clients are being temporarily locked out of auth endpoints.
        Investigate source IPs in the backend warn logs.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#auth-brute-force-lockout'
  # Backend is down: the scrape of the backend job has failed for 1 minute.
  # NOTE(review): this expression returns nothing when no `up` series exists
  # for the job at all (e.g. the target was removed from the scrape config),
  # so the alert stays silent in that case — consider pairing with absent().
  - alert: BackendDown
    expr: up{job="trivela-backend"} == 0
    for: 1m
    labels:
      severity: critical
      team: backend
    annotations:
      summary: 'Trivela backend is unreachable'
      description: >-
        Prometheus cannot scrape the backend /metrics endpoint. Service may be down.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#backend-restart'
# Soroban RPC health: degraded status, consecutive errors, and reachability
# of the RPC health-check scrape target. Evaluated every 30 seconds.
- name: trivela_rpc
  interval: 30s
  rules:
  # Fires when the RPC health metric has reported status="degraded" (value 1)
  # continuously for 2 minutes.
  - alert: SorobanRPCDegraded
    expr: |
      trivela_rpc_health_status{status="degraded"} == 1
    for: 2m
    labels:
      severity: warning
      team: infrastructure
    annotations:
      summary: 'Soroban RPC is degraded'
      description: >-
        The Soroban RPC endpoint has been reporting 'degraded' status for more
        than 2 minutes.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover'
  # Fires when the consecutive-error metric has stayed above zero for 1 minute.
  # NOTE(review): the `_total` suffix conventionally marks a counter, and a
  # counter compared with `> 0` never resolves after the first error.
  # Presumably this metric is a gauge that resets on success — confirm.
  - alert: SorobanRPCConsecutiveErrors
    expr: |
      trivela_rpc_consecutive_errors_total > 0
    for: 1m
    labels:
      severity: critical
      team: infrastructure
    annotations:
      summary: 'Soroban RPC consecutive errors for >60s'
      description: >-
        The Soroban RPC has been returning consecutive errors for over 60 seconds.
        Contract interactions will fail.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover'
  # Fires when the RPC health-check scrape target has been unreachable for
  # 2 minutes. Links to the same failover runbook section as the other RPC
  # alerts in this group so every alert carries a runbook_url.
  - alert: RPCHealthCheckDown
    expr: up{job="trivela-rpc-health"} == 0
    for: 2m
    labels:
      severity: critical
      team: infrastructure
    annotations:
      summary: 'Trivela RPC health check endpoint unreachable'
      description: >-
        Cannot reach the RPC health endpoint. Check Soroban node connectivity.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#rpc-failover'
# Contract and campaign-storage alerts: indexed contract pause events and
# write failures on the campaigns table. Evaluated every 60 seconds.
- name: trivela_contracts
  interval: 60s
  rules:
  # Fires immediately when the count of indexed "paused" contract events
  # grew during the last 5 minutes. The description reads the contract_id
  # label from the firing series.
  - alert: ContractPaused
    expr: |
      increase(trivela_contract_events_total{event_type="paused"}[5m]) > 0
    for: 0m
    labels:
      severity: critical
      team: contracts
    annotations:
      summary: 'Trivela campaign contract has been PAUSED'
      description: >-
        A contract pause event was indexed. Contract ID: {{ $labels.contract_id }}.
        All campaign interactions are blocked.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#contract-pause-response'
  # Fires when more than 5 write errors on the campaigns table were recorded
  # over a 5-minute window and the condition has held for 2 minutes.
  - alert: CampaignDBWriteErrors
    expr: |
      increase(trivela_db_write_errors_total{table="campaigns"}[5m]) > 5
    for: 2m
    labels:
      severity: warning
      team: backend
    annotations:
      summary: 'Campaign database write errors detected'
      description: >-
        {{ $value }} DB write errors on the campaigns table in the last 5 minutes.
      runbook_url: 'https://github.com/FinesseStudioLab/Trivela/blob/main/docs/RUNBOOK.md#db-backup-restore'