Skip to content

Commit eeef2b4

Browse files
committed
Initial export
0 parents  commit eeef2b4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+10110
-0
lines changed

.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
brubeck
2+
*.o
3+
*.swp
4+
*~
5+
.DS_Store
6+
build/*

.gitmodules

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "vendor/ck"]
2+
path = vendor/ck
3+
url = https://github.com/concurrencykit/ck

LICENSE

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
Copyright (c) 2012-2015 GitHub, Inc.
2+
3+
Permission is hereby granted, free of charge, to any person
4+
obtaining a copy of this software and associated documentation
5+
files (the "Software"), to deal in the Software without
6+
restriction, including without limitation the rights to use,
7+
copy, modify, merge, publish, distribute, sublicense, and/or sell
8+
copies of the Software, and to permit persons to whom the
9+
Software is furnished to do so, subject to the following
10+
conditions:
11+
12+
The above copyright notice and this permission notice shall be
13+
included in all copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17+
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19+
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20+
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21+
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22+
OTHER DEALINGS IN THE SOFTWARE.

Makefile

+63
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# Build configuration for the brubeck statsd aggregator.
# `make` builds the daemon, `make test` builds and runs the test binary,
# `make clean` removes all build products.

# Short commit SHA baked into the binary so `brubeck --version` can report it.
GIT_SHA = $(shell git rev-parse --short HEAD)
TARGET = brubeck
# Runtime libraries: math, threads, librt (timers), OpenSSL crypto (HMAC
# for statsd-secure), and Jansson for JSON config parsing.
LIBS = -lm -lpthread -lrt -lcrypto -ljansson
CC = gcc
CXX = g++
# NDEBUG disables asserts in release builds; the vendored concurrencykit
# headers live under vendor/ck/include.
CFLAGS = -g -Wall -O3 -Wno-strict-aliasing -Isrc -Ivendor/ck/include -DNDEBUG=1 -DGIT_SHA=\"$(GIT_SHA)\"

.PHONY: default all clean

default: $(TARGET)
all: default

SOURCES = \
	src/backend.c \
	src/backends/carbon.c \
	src/bloom.c \
	src/city.c \
	src/histogram.c \
	src/ht.c \
	src/http.c \
	src/http/mongoose.c \
	src/internal_sampler.c \
	src/log.c \
	src/metric.c \
	src/sampler.c \
	src/samplers/statsd-secure.c \
	src/samplers/statsd.c \
	src/server.c \
	src/setproctitle.c \
	src/slab.c \
	src/utils.c

OBJECTS = $(patsubst %.c, %.o, $(SOURCES))
HEADERS = $(wildcard src/*.h) $(wildcard src/libcuckoo/*.h)

TEST_SRC = $(wildcard tests/*.c)
TEST_OBJ = $(patsubst %.c, %.o, $(TEST_SRC))

# Every object depends on all headers (coarse but safe) and on the vendored
# libck being built first, so `configure` runs before any compilation.
%.o: %.c $(HEADERS) vendor/ck/src/libck.a
	$(CC) $(CFLAGS) -c $< -o $@

.PRECIOUS: $(TARGET) $(OBJECTS)

# Link to a temp name and mv into place so a running daemon's binary is
# replaced atomically and a failed link never clobbers a working build.
$(TARGET): $(OBJECTS) brubeck.o
	$(CC) -flto brubeck.o $(OBJECTS) $(LIBS) vendor/ck/src/libck.a -o $@.new
	mv $@.new $@

# Test binary: same objects as the daemon but with the tests' own entry
# point (brubeck.o, which holds main(), is deliberately excluded).
$(TARGET)_test: $(OBJECTS) $(TEST_OBJ)
	$(CC) $(OBJECTS) $(TEST_OBJ) $(LIBS) vendor/ck/src/libck.a -o $@

test: $(TARGET)_test
	./$(TARGET)_test

# concurrencykit is vendored as a git submodule; configure then build it.
vendor/ck/Makefile:
	cd vendor/ck && ./configure

vendor/ck/src/libck.a: vendor/ck/Makefile
	$(MAKE) -C vendor/ck

clean:
	-rm -f $(OBJECTS) brubeck.o
	-rm -f $(TEST_OBJ)
	-rm -f $(TARGET) $(TARGET)_test

README.md

+208
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
# Brubeck
2+
3+
Brubeck is a [statsd](https://github.com/etsy/statsd)-compatible stats
4+
aggregator written in C.
5+
6+
## What is statsd?
7+
8+
Statsd is a metrics aggregator for Graphite (and other data storage backends). This
9+
technical documentation assumes working knowledge of what statsd is and how it works;
10+
please read the [statsd documentation](https://github.com/etsy/statsd#statsd-) for
11+
more details.
12+
13+
Statsd is a good idea, and if you're using Graphite for metrics collection in your
14+
infrastructure, you probably want a statsd-compatible aggregator in front of it.
15+
16+
## Tradeoffs
17+
18+
- Brubeck is missing many of the features of the original StatsD. We've only implemented what we felt was necessary for our metrics stack.
19+
20+
- Brubeck only runs on Linux. It won't even build on Mac OS X.
21+
22+
- Some of the performance features require a (moderately) recent version of the kernel that you may not have.
23+
24+
## Building
25+
26+
Brubeck has the following dependencies:
27+
28+
- A Turing-complete computing device running a modern version of the Linux kernel
29+
(the kernel needs to be at least 2.6.33 in order to use multiple recvmsg support)
30+
31+
- A compiler for the C programming language
32+
33+
- Jansson (`libjansson-dev` on Debian) to load the configuration (version 2.5+ is required)
34+
35+
- OpenSSL (`libcrypto`) if you're building StatsD-Secure support
36+
37+
Build brubeck by typing:
38+
39+
./script/bootstrap
40+
41+
Other operating systems or kernels can probably build Brubeck too. More specifically,
42+
Brubeck has been seen to work under FreeBSD and OpenBSD, but this is not supported.
43+
44+
## Interfacing
45+
46+
There are several ways to interact with a running Brubeck daemon.
47+
48+
### Signals
49+
50+
Brubeck answers to the following signals:
51+
52+
- `SIGINT`, `SIGTERM`: shutdown cleanly
53+
- `SIGHUP`: reopen the log files (in case you're using logrotate or an equivalent)
54+
- `SIGUSR2`: dump a newline-separated list of all the metrics currently aggregated by the
55+
daemon and their types.
56+
57+
### HTTP Endpoint
58+
59+
If enabled on the config file, Brubeck can provide an HTTP API to poll its status. The following routes are available:
60+
61+
- `GET /ping`: return a short JSON payload with the current status of the daemon (just to check it's up)
62+
- `GET /stats`: get a large JSON payload with full statistics, including active endpoints and throughputs
63+
- `GET /metric/{{metric_name}}`: get the current status of a metric, if it's being aggregated
64+
- `POST /expire/{{metric_name}}`: expire a metric that is no longer being reported to stop it from being aggregated to the backend
65+
66+
## Configuration
67+
68+
The configuration for Brubeck is loaded through a JSON file, passed on the commandline.
69+
70+
./brubeck --config=my.config.json
71+
72+
If no configuration file is passed to the daemon, it will load `config.default.json`, which
73+
contains useful defaults for local development/testing.
74+
75+
The JSON file can contain the following sections:
76+
77+
- `server_name`: a string identifying the name for this specific Brubeck instance. This will
78+
be used by the daemon when reporting its internal metrics.
79+
80+
- `dumpfile`: a path where to store the metrics list when triggering a dump (see the section on
81+
Interfacing with the daemon)
82+
83+
- `http`: if existing, this string sets the listen address and port for the HTTP API
84+
85+
- `backends`: an array of the different backends to load. If more than one backend is loaded,
86+
brubeck will function in sharding mode, distributing aggregation load evenly through all
87+
the different backends through constant-hashing.
88+
89+
- `carbon`: a backend that aggregates data into a Carbon cache. The backend sends all the
90+
aggregated data once every `frequency` seconds. By default the data is sent to the port 2003
91+
of the Carbon cache (plain text protocol), but the pickle wire protocol can be enabled by
92+
setting `pickle` to `true` and changing the port accordingly.
93+
94+
```
95+
{
96+
"type" : "carbon",
97+
"address" : "0.0.0.0",
98+
"port" : 2003,
99+
"frequency" : 10,
100+
"pickle" : true
101+
}
102+
```
103+
104+
We strongly encourage you to use the pickle wire protocol instead of plaintext,
105+
because carbon-relay.py is not very performant and will choke when parsing plaintext
106+
under enough load. Pickles are much softer CPU-wise on the Carbon relays,
107+
aggregators and caches.
108+
109+
Hmmmm pickles. Now I'm hungry. Lincoln when's lunch?
110+
111+
- `samplers`: an array of the different samplers to load. Samplers run in parallel and gather
112+
incoming metrics from the network.
113+
114+
- `statsd`: the default statsd-compatible sampler. It listens on an UDP port for metrics
115+
packets. You can have more than one statsd sampler on the same daemon, but Brubeck was
116+
designed to support a single sampler taking the full metrics load on a single port.
117+
118+
```
119+
{
120+
"type" : "statsd",
121+
"address" : "0.0.0.0",
122+
"port" : 8126
123+
}
124+
```
125+
126+
The StatsD sampler has the following options (and default values) for performance tuning:
127+
128+
- `"workers" : 4` number of worker threads that will service the StatsD socket endpoint. More threads means emptying the socket faster, but the context switching and cache smashing will affect performance. In general, you can saturate your NIC as long as you have enough worker threads (one per core) and a fast enough CPU. Set this to 1 if you want to run the daemon in event-loop mode. But that'd be silly. This is not Node.
129+
130+
- `"multisock" : false` if set to true, Brubeck will use the `SO_REUSEPORT` flag available since Linux 3.9 to create one socket per worker thread and bind it to the same address/port. The kernel will then round-robin between the threads without forcing them to race for the socket. This improves performance by up to 30%, try benchmarking this if your Kernel is recent enough.
131+
132+
- `"multimsg" : 1` if set to greater than one, Brubeck will use the `recvmmsg` syscall (available since Linux 2.6.33) to read several UDP packets (the specified amount) in a single call and reduce the amount of context switches. This doesn't improve performance much with several worker threads, but may have an effect in a limited configuration with only one thread. Make it a power of two for better results. As always, benchmark. YMMV.
133+
134+
- `statsd-secure`: like StatsD, but each packet has a HMAC that verifies its integrity. This is hella useful if you're running infrastructure in The Cloud (TM) (C) and you want to send back packets back to your VPN without them being tampered by third parties.
135+
136+
```
137+
{
138+
"type" : "statsd-secure",
139+
"address" : "0.0.0.0",
140+
"port" : 9126,
141+
"max_drift" : 3,
142+
"hmac_key" : "750c783e6ab0b503eaa86e310a5db738",
143+
"replay_len" : 8000
144+
}
145+
```
146+
147+
The `address` and `port` parts are obviously the same as in statsd.
148+
149+
- `max_drift` defines the maximum time (in seconds) that packets can be delayed
150+
since they were sent from the origin. All metrics come with a timestamp, so metrics
151+
that drift more than this value will silently be discarded.
152+
153+
- `hmac_key` is the shared HMAC secret. The client sending the metrics must also know
154+
this in order to sign them.
155+
156+
- `replay_len` is the size of the bloom filter that will be used to prevent replay
157+
attacks. We use a rolling bloom filter (one for every drift second), so `replay_len`
158+
should roughly be the amount of **unique** metrics you expect to receive in a 1s
159+
interval.
160+
161+
**NOTE**: StatsD-secure doesn't run with multiple worker threads because verifying
162+
signatures is already slow enough. Don't use this in performance critical scenarios.
163+
164+
**NOTE**: StatsD-secure uses a bloom filter to prevent replay attacks, so a small
165+
percentage of metrics *will* be dropped because of false positives. Take this into
166+
consideration.
167+
168+
**NOTE**: An HMAC does *not* encrypt the packets, it just verifies their integrity.
169+
If you need to protect the content of the packets from eavesdropping, get those
170+
external machines in your VPN.
171+
172+
**NOTE**: StatsD-secure may or may not be a good idea. If you have the chance to
173+
send all your metrics inside a VPN, I suggest you do that instead.
174+
175+
## Testing
176+
177+
There's some tests in the `test` folder for key parts of the system (such as packet parsing,
178+
and all concurrent data access); besides that we test the behavior of the daemon live on staging
179+
and production systems.
180+
181+
- Small changes are deployed into production as-is, straight from their feature branch.
182+
Deployment happens in 3 seconds for all the Brubeck instances in our infrastructure, so
183+
we can roll back into the master branch immediately if something fails.
184+
185+
- For critical changes, we multiplex a copy of the metrics stream into an Unix domain socket,
186+
so we can have two instances of the daemon (old and new) aggregating to the production
187+
cluster and a staging cluster, and verify that the metrics flow into the two clusters is equivalent.
188+
189+
- Benchmarking is performed on real hardware in our datacenter. The daemon is spammed with fake
190+
metrics across the network and we ensure that there are no regressions (particularly in the linear
191+
scaling between cores for the statsd sampler).
192+
193+
When in doubt, please refer to the part of the MIT license that says *"THE SOFTWARE IS PROVIDED
194+
'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED"*. We use Brubeck in production and
195+
have been doing so for years, but we cannot make any promises regarding availability or
196+
performance.
197+
198+
## FAQ
199+
200+
- **I cannot hit 4 million UDP metrics per second. I want my money back.**
201+
202+
Make sure receiver-side scaling is properly configured in your kernel and that IRQs
203+
are being serviced by different cores, and that the daemon's threads are not
204+
pinned to a specific core. Make sure you're running the daemon in a physical machine
205+
and not a cheap cloud VPS. Make sure your NIC has the right drivers and it's not
206+
bottlenecking. Install a newer kernel and try running with `SO_REUSEPORT`.
207+
208+
If nothing works, refunds are available upon request. Just get mad at me on Twitter.

brubeck.c

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
#include "brubeck.h"
2+
#include "getopt.h"
3+
4+
int main(int argc, char *argv[])
5+
{
6+
static struct option longopts[] = {
7+
{ "log", required_argument, NULL, 'l' },
8+
{ "config", required_argument, NULL, 'c' },
9+
{ "version", no_argument, NULL, 'v' },
10+
{ NULL, 0, NULL, 0 }
11+
};
12+
13+
struct brubeck_server _server;
14+
const char *config_file = "config.default.json";
15+
const char *log_file = NULL;
16+
int opt;
17+
18+
while ((opt = getopt_long(argc, argv, ":l:c:v", longopts, NULL)) != -1) {
19+
switch (opt) {
20+
case 'l': log_file = optarg; break;
21+
case 'c': config_file = optarg; break;
22+
case 'v':
23+
puts("brubeck " GIT_SHA);
24+
return 0;
25+
26+
default:
27+
printf("Usage: %s [--log LOG_FILE] [--config CONFIG_FILE] [--version]", argv[0]);
28+
return 1;
29+
}
30+
}
31+
32+
initproctitle(argc, argv);
33+
gh_log_open(log_file);
34+
brubeck_server_init(&_server, config_file);
35+
return brubeck_server_run(&_server);
36+
}

config.default.json.example

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
{
2+
"logfile" : "./log/brubeck.log",
3+
"sharding" : false,
4+
"server_name" : "brubeck_debug",
5+
"dumpfile" : "./brubeck.dump",
6+
"capacity" : 15,
7+
"expire" : 5,
8+
"http" : "127.0.0.1:8080",
9+
10+
"backends" : [
11+
{
12+
"type" : "carbon",
13+
"address" : "0.0.0.0",
14+
"port" : 2003,
15+
"frequency" : 10
16+
}
17+
],
18+
19+
"samplers" : [
20+
{
21+
"type" : "statsd",
22+
"address" : "0.0.0.0",
23+
"port" : 8126,
24+
"workers" : 4,
25+
"multisock" : true,
26+
"multimsg" : 8
27+
},
28+
{
29+
"type" : "statsd-secure",
30+
"address" : "0.0.0.0",
31+
"port" : 9126,
32+
"max_drift" : 3,
33+
"hmac_key" : "750c783e6ab0b503eaa86e310a5db738",
34+
"replay_len" : 8000
35+
}
36+
]
37+
}

script/bootstrap

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/bin/sh
# Bootstrap the build: resolve the repository root from this script's
# location, fetch the vendored submodules, and build the brubeck daemon.

root_dir=$(dirname "$0")/..
cd "$root_dir" && git submodule update --init && make brubeck

0 commit comments

Comments
 (0)