Skip to content

Commit e7eb641

Browse files
authored
fix(cmd): add null check for empty path in fetchDirectory function (#789)
Signed-off-by: wuhuizuo <[email protected]>
1 parent 15d1d59 commit e7eb641

File tree

3 files changed

+336
-0
lines changed

3 files changed

+336
-0
lines changed

cmd/monitoring.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,10 @@ func getTag(defaultTag string, fixTag string) string {
247247
}
248248

249249
func fetchDirectory(rservice *common.GitRepoService, owner string, repoName string, path string, ref string) []*common.RepositoryContent {
250+
if path == "" {
251+
return nil
252+
}
253+
250254
log.Printf("fetch dir %s from %s/%s at rev:%s", path, owner, repoName, ref)
251255
fileContent, monitorDirectory, err := rservice.GetContents(owner, repoName, path, &common.RepositoryContentGetOptions{
252256
Ref: ref,
-78.3 KB
Binary file not shown.
Lines changed: 332 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,332 @@
1+
groups:
2+
- name: alert.rules
3+
rules:
4+
- alert: TiKV_critical_error
5+
expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0
6+
labels:
7+
env: ENV_LABELS_ENV
8+
expr: sum(rate(tikv_critical_error_total[1m])) BY (type, instance) > 0
9+
level: critical
10+
annotations:
11+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
12+
summary: TiKV encounters critical error
13+
value: '{{ $value }}'
14+
- alert: TiKV_memory_used_too_fast
15+
expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024
16+
for: 5m
17+
labels:
18+
env: ENV_LABELS_ENV
19+
expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024
20+
level: emergency
21+
annotations:
22+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
23+
summary: TiKV memory used too fast
24+
value: '{{ $value }}'
25+
- alert: TiKV_GC_can_not_work
26+
expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 and (sum(increase(tikv_gc_compaction_filter_perform[1d])) < 1 and sum(increase(tikv_engine_event_total{db="kv", cf="write", type="compaction"}[1d])) >= 1)
27+
for: 5m
28+
labels:
29+
env: ENV_LABELS_ENV
30+
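expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1 and (sum(increase(tikv_gc_compaction_filter_perform[1d])) < 1 and sum(increase(tikv_engine_event_total{db="kv", cf="write", type="compaction"}[1d])) >= 1)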
31+
level: emergency
32+
annotations:
33+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
34+
summary: TiKV GC can not work
35+
value: '{{ $value }}'
36+
- alert: TiKV_server_report_failure_msg_total
37+
expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
38+
for: 1m
39+
labels:
40+
env: ENV_LABELS_ENV
41+
expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
42+
level: critical
43+
annotations:
44+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
45+
summary: TiKV server_report_failure_msg_total error
46+
value: '{{ $value }}'
47+
- alert: TiKV_channel_full_total
48+
expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
49+
for: 1m
50+
labels:
51+
env: ENV_LABELS_ENV
52+
expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
53+
level: critical
54+
annotations:
55+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
56+
summary: TiKV channel full
57+
value: '{{ $value }}'
58+
- alert: TiKV_write_stall
59+
expr: delta( tikv_engine_write_stall[10m]) > 0
60+
for: 1m
61+
labels:
62+
env: ENV_LABELS_ENV
63+
expr: delta( tikv_engine_write_stall[10m]) > 0
64+
level: critical
65+
annotations:
66+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
67+
summary: TiKV write stall
68+
value: '{{ $value }}'
69+
- alert: TiKV_maybe_write_stall
70+
expr: max(tikv_scheduler_l0_avg) by (instance) > 80
71+
for: 1m
72+
labels:
73+
env: ENV_LABELS_ENV
74+
expr: max(tikv_scheduler_l0_avg) by (instance) > 80
75+
level: critical
76+
annotations:
77+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
78+
summary: TiKV the average number of L0 files exceeds 80
79+
value: '{{ $value }}'
80+
- alert: TiKV_raft_log_lag
81+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000
82+
for: 1m
83+
labels:
84+
env: ENV_LABELS_ENV
85+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000
86+
level: critical
87+
annotations:
88+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
89+
summary: TiKV raftstore log lag more than 5000
90+
value: '{{ $value }}'
91+
- alert: TiKV_async_request_snapshot_duration_seconds
92+
expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1
93+
for: 1m
94+
labels:
95+
env: ENV_LABELS_ENV
96+
expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1
97+
level: critical
98+
annotations:
99+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
100+
summary: TiKV async request snapshot duration seconds more than 1s
101+
value: '{{ $value }}'
102+
- alert: TiKV_async_request_write_duration_seconds
103+
expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1
104+
for: 1m
105+
labels:
106+
env: ENV_LABELS_ENV
107+
expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1
108+
level: critical
109+
annotations:
110+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
111+
summary: TiKV async request write duration seconds more than 1s
112+
value: '{{ $value }}'
113+
- alert: TiKV_coprocessor_request_wait_seconds
114+
expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{type="all"}[1m])) by (le, instance, req)) > 10
115+
for: 1m
116+
labels:
117+
env: ENV_LABELS_ENV
118+
expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{type="all"}[1m])) by (le, instance, req)) > 10
119+
level: critical
120+
annotations:
121+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
122+
summary: TiKV coprocessor request wait seconds more than 10s
123+
value: '{{ $value }}'
124+
- alert: TiKV_raftstore_thread_cpu_seconds_total
125+
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"(raftstore|rs)_.*"}[1m])) by (instance) > 1.6
126+
for: 1m
127+
labels:
128+
env: ENV_LABELS_ENV
129+
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"(raftstore|rs)_.*"}[1m])) by (instance) > 1.6
130+
level: critical
131+
annotations:
132+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
133+
summary: TiKV raftstore thread CPU seconds is high
134+
value: '{{ $value }}'
135+
- alert: TiKV_raft_append_log_duration_secs
136+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
137+
for: 1m
138+
labels:
139+
env: ENV_LABELS_ENV
140+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
141+
level: critical
142+
annotations:
143+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
144+
summary: TiKV_raft_append_log_duration_secs
145+
value: '{{ $value }}'
146+
- alert: TiKV_raft_apply_log_duration_secs
147+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
148+
for: 1m
149+
labels:
150+
env: ENV_LABELS_ENV
151+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
152+
level: critical
153+
annotations:
154+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
155+
summary: TiKV_raft_apply_log_duration_secs
156+
value: '{{ $value }}'
157+
- alert: TiKV_scheduler_latch_wait_duration_seconds
158+
expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1
159+
for: 1m
160+
labels:
161+
env: ENV_LABELS_ENV
162+
expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1
163+
level: critical
164+
annotations:
165+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
166+
summary: TiKV scheduler latch wait duration seconds more than 1s
167+
value: '{{ $value }}'
168+
- alert: TiKV_thread_apply_worker_cpu_seconds
169+
expr: max(rate(tikv_thread_cpu_seconds_total{name=~"apply_.*"}[1m])) by (instance) > 0.9
170+
for: 1m
171+
labels:
172+
env: ENV_LABELS_ENV
173+
expr: max(rate(tikv_thread_cpu_seconds_total{name=~"apply_.*"}[1m])) by (instance) > 0.9
174+
level: critical
175+
annotations:
176+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
177+
summary: TiKV thread apply worker cpu seconds is high
178+
value: '{{ $value }}'
179+
- alert: TiDB_tikvclient_gc_action_fail
180+
expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
181+
for: 1m
182+
labels:
183+
env: ENV_LABELS_ENV
184+
expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
185+
level: critical
186+
annotations:
187+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
188+
summary: TiDB_tikvclient_gc_action_fail
189+
value: '{{ $value }}'
190+
- alert: TiKV_leader_drops
191+
expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
192+
for: 1m
193+
labels:
194+
env: ENV_LABELS_ENV
195+
expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
196+
level: warning
197+
annotations:
198+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
199+
summary: TiKV leader drops
200+
value: '{{ $value }}'
201+
- alert: TiKV_raft_process_ready_duration_secs
202+
expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2
203+
for: 1m
204+
labels:
205+
env: ENV_LABELS_ENV
206+
expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2
207+
level: warning
208+
annotations:
209+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
210+
summary: TiKV_raft_process_ready_duration_secs
211+
value: '{{ $value }}'
212+
- alert: TiKV_raft_process_tick_duration_secs
213+
expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2
214+
for: 1m
215+
labels:
216+
env: ENV_LABELS_ENV
217+
expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2
218+
level: warning
219+
annotations:
220+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
221+
summary: TiKV_raft_process_tick_duration_secs
222+
value: '{{ $value }}'
223+
- alert: TiKV_scheduler_context_total
224+
expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
225+
for: 1m
226+
labels:
227+
env: ENV_LABELS_ENV
228+
expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
229+
level: warning
230+
annotations:
231+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
232+
summary: TiKV scheduler context total
233+
value: '{{ $value }}'
234+
- alert: TiKV_scheduler_command_duration_seconds
235+
expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1
236+
for: 1m
237+
labels:
238+
env: ENV_LABELS_ENV
239+
expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1
240+
level: warning
241+
annotations:
242+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
243+
summary: TiKV scheduler command duration seconds more than 1s
244+
value: '{{ $value }}'
245+
- alert: TiKV_coprocessor_pending_request
246+
expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
247+
for: 1m
248+
labels:
249+
env: ENV_LABELS_ENV
250+
expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
251+
level: warning
252+
annotations:
253+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
254+
summary: TiKV pending {{ $labels.type }} request is high
255+
value: '{{ $value }}'
256+
- alert: TiKV_coprocessor_cpu_util
257+
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1
258+
for: 1m
259+
labels:
260+
env: ENV_LABELS_ENV
261+
expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / (count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) by (instance) * 0.9) >= 1
262+
level: warning
263+
annotations:
264+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
265+
summary: TiKV coprocessor CPU utilization exceeds 90%
266+
value: '{{ $value }}'
267+
- alert: TiKV_pending_task
268+
expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000
269+
for: 1m
270+
labels:
271+
env: ENV_LABELS_ENV
272+
expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000
273+
level: warning
274+
annotations:
275+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
276+
summary: TiKV pending task too much
277+
value: '{{ $value }}'
278+
- alert: TiKV_low_space
279+
expr: sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2
280+
for: 1m
281+
labels:
282+
env: ENV_LABELS_ENV
283+
expr: sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2
284+
level: warning
285+
annotations:
286+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
287+
summary: TiKV available disk space too low
288+
value: '{{ $value }}'
289+
- alert: TiKV_approximate_region_size
290+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
291+
for: 1m
292+
labels:
293+
env: ENV_LABELS_ENV
294+
expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
295+
level: warning
296+
annotations:
297+
description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
298+
summary: TiKV approximate region size is more than 1GB
299+
value: '{{ $value }}'
300+
- alert: TiKV_node_restart
301+
expr: changes(process_start_time_seconds{job="tikv"}[5m]) > 0
302+
for: 1m
303+
labels:
304+
env: ENV_LABELS_ENV
305+
expr: changes(process_start_time_seconds{job="tikv"}[5m]) > 0
306+
level: warning
307+
annotations:
308+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
309+
summary: TiKV server has been restarted
310+
value: '{{ $value }}'
311+
- alert: TiKV_cpu_quota
312+
expr: irate(process_cpu_seconds_total{job="tikv"}[30s]) / tikv_server_cpu_cores_quota > 0.8
313+
for: 45s
314+
labels:
315+
env: ENV_LABELS_ENV
316+
expr: irate(process_cpu_seconds_total{job="tikv"}[30s]) / tikv_server_cpu_cores_quota > 0.8
317+
level: warning
318+
annotations:
319+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
320+
summary: TiKV CPU usage is over 80% of CPU quota
321+
value: '{{ $value }}'
322+
- alert: TiKV_memory_quota
323+
expr: process_resident_memory_bytes{job="tikv"} / tikv_server_memory_quota_bytes > 0.8
324+
for: 15s
325+
labels:
326+
env: ENV_LABELS_ENV
327+
expr: process_resident_memory_bytes{job="tikv"} / tikv_server_memory_quota_bytes > 0.8
328+
level: warning
329+
annotations:
330+
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
331+
summary: TiKV memory usage is over 80% of memory quota
332+
value: '{{ $value }}'

0 commit comments

Comments
 (0)