Skip to content

Commit 92b60c2

Browse files
authored
Merge pull request #703 from ydb-platform/topic_slo
Topic workload SLO
2 parents 2d1ff8d + dd8ff1d commit 92b60c2

23 files changed

+1535
-548
lines changed

.github/workflows/slo.yml

Lines changed: 51 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,54 @@ jobs:
3030

3131
strategy:
3232
matrix:
33-
workload:
34-
- sync-table
35-
- sync-query
33+
include:
34+
- prefix: table
35+
workload: sync-table
36+
create-args: grpc://localhost:2135 /Root/testdb
37+
run-args: |
38+
grpc://localhost:2135 /Root/testdb \
39+
--prom-pgw localhost:9091 \
40+
--report-period 250 \
41+
--time ${{inputs.slo_workload_duration_seconds || 600}} \
42+
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
43+
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
44+
--read-timeout 1000 \
45+
--write-timeout 1000
46+
cleanup-args: grpc://localhost:2135 /Root/testdb
47+
- prefix: table
48+
workload: sync-query
49+
create-args: grpc://localhost:2135 /Root/testdb
50+
run-args: |
51+
grpc://localhost:2135 /Root/testdb \
52+
--prom-pgw localhost:9091 \
53+
--report-period 250 \
54+
--time ${{inputs.slo_workload_duration_seconds || 600}} \
55+
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
56+
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
57+
--read-timeout 1000 \
58+
--write-timeout 1000
59+
cleanup-args: grpc://localhost:2135 /Root/testdb
60+
# - prefix: topic
61+
# workload: topic-basic
62+
# create-args: |
63+
# grpc://localhost:2135 /Root/testdb \
64+
# --path /Root/testdb/slo_topic \
65+
# --partitions-count 10
66+
# run-args: |
67+
# grpc://localhost:2135 /Root/testdb \
68+
# --path /Root/testdb/slo_topic \
69+
# --prom-pgw localhost:9091 \
70+
# --partitions-count 10 \
71+
# --read-threads 10 \
72+
# --write-threads 10 \
73+
# --report-period 250 \
74+
# --time ${{inputs.slo_workload_duration_seconds || 600}} \
75+
# --read-rps ${{inputs.slo_workload_read_max_rps || 100}} \
76+
# --write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
77+
# --read-timeout 5000 \
78+
# --write-timeout 5000
79+
# cleanup-args: grpc://localhost:2135 /Root/testdb --path /Root/testdb/slo_topic
80+
3681

3782
concurrency:
3883
group: slo-${{ github.ref }}-${{ matrix.workload }}
@@ -64,26 +109,19 @@ jobs:
64109

65110
- name: Prepare SLO Database
66111
run: |
67-
python ./tests/slo/src create grpc://localhost:2135 /Root/testdb
112+
python ./tests/slo/src ${{ matrix.prefix }}-create ${{ matrix.create-args }}
68113
69114
- name: Run SLO Tests
70115
env:
71116
REF: '${{ github.head_ref || github.ref }}'
72117
WORKLOAD: '${{ matrix.workload }}'
73118
run: |
74-
python ./tests/slo/src run grpc://localhost:2135 /Root/testdb \
75-
--prom-pgw localhost:9091 \
76-
--report-period 250 \
77-
--time ${{inputs.slo_workload_duration_seconds || 600}} \
78-
--read-rps ${{inputs.slo_workload_read_max_rps || 1000}} \
79-
--write-rps ${{inputs.slo_workload_write_max_rps || 100}} \
80-
--read-timeout 1000 \
81-
--write-timeout 1000
119+
python ./tests/slo/src ${{ matrix.prefix }}-run ${{ matrix.run-args }}
82120
83121
- if: always()
84122
name: Cleanup SLO Database
85123
run: |
86-
python ./tests/slo/src cleanup grpc://localhost:2135 /Root/testdb
124+
python ./tests/slo/src ${{ matrix.prefix }}-cleanup ${{ matrix.cleanup-args }}
87125
88126
- if: always()
89127
name: Store ydb chaos testing logs

tests/slo/README.md

Lines changed: 122 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,42 +3,72 @@
33
SLO is a type of test where an app based on ydb-sdk is tested against failing YDB cluster nodes, tablets, and network
44
(these are possible situations for distributed DBs with hundreds of nodes)
55

6-
### Implementations:
6+
### Workload types:
7+
8+
There are two workload types:
9+
10+
- **Table SLO** - tests table operations (read/write)
11+
- **Topic SLO** - tests topic operations (publish/consume)
712

8-
There are two implementations:
13+
### Implementations:
914

1015
- `sync`
1116
- `async` (now unimplemented)
1217

1318
### Usage:
1419

15-
It has 3 commands:
20+
Each workload type has 3 commands:
21+
22+
**Table commands:**
23+
- `table-create` - creates table in database
24+
- `table-cleanup` - drops table in database
25+
- `table-run` - runs table workload (read and write to table with set RPS)
1626

17-
- `create` - creates table in database
18-
- `cleanup` - drops table in database
19-
- `run` - runs workload (read and write to table with sets RPS)
27+
**Topic commands:**
28+
- `topic-create` - creates topic with consumer in database
29+
- `topic-cleanup` - drops topic in database
30+
- `topic-run` - runs topic workload (publish and consume messages with set RPS)
2031

2132
### Run examples with all arguments:
2233

23-
create:
24-
`python tests/slo/src/ create localhost:2136 /local -t tableName
34+
**Table examples:**
35+
36+
table-create:
37+
`python tests/slo/src/ table-create localhost:2136 /local -t tableName
2538
--min-partitions-count 6 --max-partitions-count 1000 --partition-size 1 -c 1000
2639
--write-timeout 10000`
2740

28-
cleanup:
29-
`python tests/slo/src/ cleanup localhost:2136 /local -t tableName`
41+
table-cleanup:
42+
`python tests/slo/src/ table-cleanup localhost:2136 /local -t tableName`
3043

31-
run:
32-
`python tests/slo/src/ run localhost:2136 /local -t tableName
33-
--prom-pgw http://prometheus-pushgateway:9091 -report-period 250
44+
table-run:
45+
`python tests/slo/src/ table-run localhost:2136 /local -t tableName
46+
--prom-pgw http://prometheus-pushgateway:9091 --report-period 250
3447
--read-rps 1000 --read-timeout 10000
3548
--write-rps 100 --write-timeout 10000
3649
--time 600 --shutdown-time 30`
3750

51+
**Topic examples:**
52+
53+
topic-create:
54+
`python tests/slo/src/ topic-create localhost:2136 /local
55+
--topic-path /local/slo_topic --topic-consumer slo_consumer`
56+
57+
topic-cleanup:
58+
`python tests/slo/src/ topic-cleanup localhost:2136 /local --topic-path /local/slo_topic`
59+
60+
topic-run:
61+
`python tests/slo/src/ topic-run localhost:2136 /local
62+
--topic-path /local/slo_topic --topic-consumer slo_consumer
63+
--prom-pgw http://prometheus-pushgateway:9091 --report-period 250
64+
--topic-write-rps 50 --topic-read-rps 100
65+
--topic-write-timeout 5000 --topic-read-timeout 3000
66+
--time 600 --shutdown-time 30`
67+
3868
## Arguments for commands:
3969

40-
### create
41-
`python tests/slo/src/ create <endpoint> <db> [options]`
70+
### table-create
71+
`python tests/slo/src/ table-create <endpoint> <db> [options]`
4272

4373
```
4474
Arguments:
@@ -61,8 +91,8 @@ Options:
6191
6292
```
6393

64-
### cleanup
65-
`python tests/slo/src/ cleanup <endpoint> <db> [options]`
94+
### table-cleanup
95+
`python tests/slo/src/ table-cleanup <endpoint> <db> [options]`
6696

6797
```
6898
Arguments:
@@ -73,8 +103,8 @@ Options:
73103
-t --table-name <string> table name to drop
74104
```
75105

76-
### run
77-
`python tests/slo/src/ run <endpoint> <db> [options]`
106+
### table-run
107+
`python tests/slo/src/ table-run <endpoint> <db> [options]`
78108

79109
```
80110
Arguments:
@@ -100,12 +130,70 @@ Options:
100130
--write-threads <int> number of threads to use for write requests
101131
```
102132

133+
### topic-create
134+
`python tests/slo/src/ topic-create <endpoint> <db> [options]`
135+
136+
```
137+
Arguments:
138+
endpoint YDB endpoint to connect to
139+
db YDB database to connect to
140+
141+
Options:
142+
--topic-path <string> topic path to create
143+
--topic-consumer <string> consumer name
144+
--topic-min-partitions <int> minimum active partitions
145+
--topic-max-partitions <int> maximum active partitions
146+
--topic-retention-hours <int> retention period in hours
147+
```
148+
149+
### topic-cleanup
150+
`python tests/slo/src/ topic-cleanup <endpoint> <db> [options]`
151+
152+
```
153+
Arguments:
154+
endpoint YDB endpoint to connect to
155+
db YDB database to connect to
156+
157+
Options:
158+
--topic-path <string> topic path to drop
159+
```
160+
161+
### topic-run
162+
`python tests/slo/src/ topic-run <endpoint> <db> [options]`
163+
164+
```
165+
Arguments:
166+
endpoint YDB endpoint to connect to
167+
db YDB database to connect to
168+
169+
Options:
170+
--topic-path <string> topic path
171+
--topic-consumer <string> consumer name
172+
173+
--prom-pgw <string> prometheus push gateway
174+
--report-period <int> prometheus push period in milliseconds
175+
176+
--topic-read-rps <int> read RPS for topics
177+
--topic-read-timeout <int> read timeout milliseconds for topics
178+
--topic-write-rps <int> write RPS for topics
179+
--topic-write-timeout <int> write timeout milliseconds for topics
180+
181+
--topic-message-size <int> message size in bytes
182+
--topic-read-threads <int> number of threads to use for read requests
183+
--topic-write-threads <int> number of threads to use for write requests
184+
185+
--time <int> run time in seconds
186+
--shutdown-time <int> graceful shutdown time in seconds
187+
```
188+
103189
## Authentication
104190

105191
Workload using [auth-env](https://ydb.yandex-team.ru/docs/reference/ydb-sdk/recipes/auth-env) for authentication.
106192

107193
## What's inside
108-
When running `run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.
194+
195+
### Table workload
196+
When running `table-run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.
109197

110198
- `readJob` reads rows from the table one by one with random identifiers generated by writeJob
111199
- `writeJob` generates and inserts rows
@@ -120,13 +208,27 @@ Table have these fields:
120208

121209
Primary key: `("object_hash", "object_id")`
122210

211+
### Topic workload
212+
When running `topic-run` command, the program creates three jobs: `readJob`, `writeJob`, `metricsJob`.
213+
214+
- `readJob` reads messages from topic using TopicReader and commits offsets
215+
- `writeJob` generates and publishes messages to topic using TopicWriter
216+
- `metricsJob` periodically sends metrics to Prometheus
217+
218+
Messages contain:
219+
- Sequential message ID
220+
- Thread identifier
221+
- Configurable payload size (padded with 'x' characters)
222+
123223
## Collected metrics
124224
- `oks` - amount of OK requests
125225
- `not_oks` - amount of not OK requests
126226
- `inflight` - amount of requests in flight
127227
- `latency` - summary of latencies in ms
128228
- `attempts` - summary of the number of attempts per request
129229

230+
Metrics are collected for both table operations (`read`, `write`) and topic operations (`read`, `write`).
231+
130232
> You must reset metrics so that they read `0` in Prometheus and Grafana before the jobs begin and after they end
131233
132234
## Look at metrics in grafana

tests/slo/playground/README.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# SLO playground
2+
3+
Playground may be used for testing SLO workloads locally
4+
5+
It has several services:
6+
7+
- `prometheus` - storage for metrics
8+
- `prometheus-pushgateway` - push acceptor for prometheus
9+
- `grafana` - provides charts for metrics
10+
- `ydb` - local instance of ydb-database to run workload with
11+
12+
## Network addresses
13+
14+
- Grafana dashboard: http://localhost:3000
15+
- Prometheus pushgateway: http://localhost:9091
16+
- YDB monitoring: http://localhost:8765
17+
- YDB GRPC: grpc://localhost:2136
18+
- YDB GRPC TLS: grpcs://localhost:2135
19+
20+
## Start
21+
22+
```shell
23+
docker-compose up -d
24+
```
25+
26+
## Stop
27+
28+
```shell
29+
docker-compose down
30+
```
31+
32+
## Configs
33+
34+
Grafana's dashboards are stored in `configs/grafana/provisioning/dashboards`
35+
36+
## Data
37+
38+
YDB databases are not persistent
39+
40+
All other data like metrics and certs are stored in `data/`
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#!/bin/sh -e
2+
3+
# Pick one running container whose name starts with "ydb-database-" at random.
#
# Globals:
#   containers    (written) newline-separated names of the matching containers
#   nodeForChaos  (written) name of the chosen container; empty on failure
# Outputs:
#   an error message on stderr when no matching container is running
# Returns:
#   0 when a container was chosen, 1 when none is running. Under `sh -e` an
#   unguarded call that returns 1 terminates the script.
get_random_container() {
    # `docker ps --format '{{.Names}}'` prints one name per line. A grep with
    # no match exits with status 1; the `|| containers=""` fallback keeps that
    # from aborting the script under `-e` before the check below can report it.
    containers=$(docker ps --format '{{.Names}}' | grep '^ydb-database-') || containers=""

    if [ -z "$containers" ]; then
        echo "get_random_container: no running ydb-database-* containers found" >&2
        nodeForChaos=""
        return 1
    fi

    # shuf -n 1 emits a single randomly selected input line, which replaces
    # the manual count / random index / sed lookup.
    nodeForChaos=$(printf '%s\n' "$containers" | shuf -n 1)
}
19+
20+
21+
# Initial delay before the first disruption.
# NOTE(review): presumably gives the YDB cluster time to start - confirm.
sleep 20

echo "Start CHAOS YDB cluster!"

# Each iteration stops one randomly chosen node, starts it again, then waits
# a minute before the next disruption.
for i in $(seq 1 1000)
do
    echo "[$(date)]: docker stop/start iteration $i"

    # Sets nodeForChaos to the container that will be disrupted.
    get_random_container

    # Call docker directly instead of through `sh -c "..."`: no extra shell is
    # spawned and the container name is passed as one quoted argument rather
    # than interpolated into a command string. -t 10 sets the stop timeout in
    # seconds and is placed before the container name.
    docker stop -t 10 "${nodeForChaos}"
    docker start "${nodeForChaos}"

    sleep 60
done
36+
37+
# for i in $(seq 1 3)
38+
# do
39+
# echo "[$(date)]: docker restart iteration $i"
40+
41+
# get_random_container
42+
43+
# sh -c "docker restart ${nodeForChaos} -t 0"
44+
45+
# sleep 60
46+
# done
47+
48+
# get_random_container
49+
50+
# echo "[$(date)]: docker kill -s SIGKILL ${nodeForChaos}"
51+
52+
# sh -c "docker kill -s SIGKILL ${nodeForChaos}"

0 commit comments

Comments
 (0)