Skip to content

Commit 50e24f6

Browse files
authored
Add more useful metrics to otel collector (#1546)
1 parent 46b0925 commit 50e24f6

File tree

5 files changed

+153
-36
lines changed

5 files changed

+153
-36
lines changed

.editorconfig

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[*.yaml]
2+
indent_size = 2
3+
indent_style = space

iac/provider-gcp/.terraform.lock.hcl

Lines changed: 6 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

iac/provider-gcp/nomad/configs/otel-collector.yaml

Lines changed: 142 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,56 +21,76 @@ receivers:
2121
hostmetrics:
2222
root_path: /hostfs
2323
collection_interval: 30s
24+
25+
# all scrapers are added explicitly, so we can be clear
26+
# about which metrics we've chosen to include/exclude.
27+
# https://github.com/open-telemetry/opentelemetry-collector-contrib/blob/main/receiver/hostmetricsreceiver/README.md
2428
scrapers:
2529
cpu:
2630
metrics:
27-
system.cpu.time:
28-
enabled: false
29-
system.cpu.utilization:
31+
system.cpu.time: # counter of seconds spent
32+
enabled: true
33+
system.cpu.frequency:
3034
enabled: false
3135
system.cpu.logical.count:
3236
enabled: true
3337
system.cpu.physical.count:
3438
enabled: true
39+
system.cpu.utilization: # percentage utilized in the moment
40+
enabled: false
41+
42+
# entirely disabled, so we omit the module
43+
#disk:
44+
# metrics:
45+
# system.disk.io:
46+
# enabled: false
47+
# system.disk.io_time:
48+
# enabled: false
49+
# system.disk.merged:
50+
# enabled: false
51+
# system.disk.operation_time:
52+
# enabled: false
53+
# system.disk.operations:
54+
# enabled: false
55+
# system.disk.pending_operations:
56+
# enabled: false
57+
# system.disk.weighted_io_time:
58+
# enabled: false
3559

3660
load:
3761
metrics:
62+
system.cpu.load_average.15m:
63+
enabled: true
3864
system.cpu.load_average.1m:
3965
enabled: true
4066
system.cpu.load_average.5m:
4167
enabled: true
42-
system.cpu.load_average.15m:
43-
enabled: false
4468

45-
# Scraping traffic going in/out of network interfaces
46-
network:
69+
# Attributes we don't need (mode, type) are stripped later by attributes/strip_fs_labels
70+
filesystem:
4771
metrics:
48-
system.network.connections:
49-
enabled: false
50-
system.network.dropped:
51-
enabled: false
52-
system.network.errors:
53-
enabled: false
54-
system.network.io:
55-
enabled: true
56-
system.network.packets:
57-
enabled: false
72+
system.filesystem.inodes.usage:
73+
enabled: true
74+
system.filesystem.usage:
75+
enabled: true
76+
system.filesystem.utilization:
77+
enabled: true
5878

5979
# Scraping total memory (memory limit), available memory, usage, utilization and dirty pages; page_size stays disabled
6080
memory:
6181
metrics:
62-
system.linux.memory.dirty:
63-
enabled: false
82+
system.memory.usage:
83+
enabled: true
6484
system.linux.memory.available: # free memory
6585
enabled: true
86+
system.linux.memory.dirty: # data waiting to be written to disk
87+
enabled: true
6688
system.memory.limit: # total memory
6789
enabled: true
6890
system.memory.page_size:
6991
enabled: false
70-
system.memory.usage:
71-
enabled: false
7292
system.memory.utilization:
73-
enabled: false
93+
enabled: true
7494

7595
# Huge pages metrics added as part of a temporary patch
7696
# https://github.com/e2b-dev/opentelemetry-collector-contrib/pull/1
@@ -85,14 +105,80 @@ receivers:
85105
system.linux.memory.huge_pages.total:
86106
enabled: true
87107

88-
# filter not needed stuff (mode, type)
89-
# filter /dev/loop*, maybe check fo other that are polluting
90-
filesystem:
108+
# Scraping traffic going in/out of network interfaces
109+
network:
91110
metrics:
92-
system.filesystem.inodes.usage:
111+
system.network.connections:
112+
enabled: true
113+
system.network.dropped:
114+
enabled: true
115+
system.network.errors:
116+
enabled: true
117+
system.network.io:
118+
enabled: true
119+
system.network.packets:
93120
enabled: false
94-
system.filesystem.usage:
121+
system.network.conntrack.count:
122+
enabled: true
123+
system.network.conntrack.max:
124+
enabled: true
125+
126+
# entirely disabled, so we omit the module
127+
#paging:
128+
# metrics:
129+
# system.paging.faults:
130+
# enabled: false
131+
# system.paging.operations:
132+
# enabled: false
133+
# system.paging.usage:
134+
# enabled: false
135+
# system.paging.utilization:
136+
# enabled: false
137+
138+
processes:
139+
metrics:
140+
system.processes.count:
95141
enabled: true
142+
system.processes.created:
143+
enabled: true
144+
145+
# entirely disabled, so we omit the module
146+
#process:
147+
# metrics:
148+
# process.cpu.time:
149+
# enabled: false
150+
# process.disk.io:
151+
# enabled: false
152+
# process.memory.usage:
153+
# enabled: false
154+
# process.memory.virtual:
155+
# enabled: false
156+
# process.context_switches:
157+
# enabled: false
158+
# process.cpu.utilization:
159+
# enabled: false
160+
# process.disk.operations:
161+
# enabled: false
162+
# process.handles:
163+
# enabled: false
164+
# process.memory.utilization:
165+
# enabled: false
166+
# process.open_file_descriptors:
167+
# enabled: false
168+
# process.paging.faults:
169+
# enabled: false
170+
# process.signals_pending:
171+
# enabled: false
172+
# process.threads:
173+
# enabled: false
174+
# process.uptime:
175+
# enabled: false
176+
177+
# entirely disabled, so we omit the module
178+
#system:
179+
# metrics:
180+
# system.uptime:
181+
# enabled: false
96182

97183
processors:
98184
batch:
@@ -107,14 +193,15 @@ processors:
107193
metrics:
108194
datapoint:
109195
# Drop all system.network.* metrics for veth*, docker* and lo interfaces
110-
- 'metric.name == "system.network.io" and IsMatch(attributes["device"], "^(veth-.*|docker.*|lo)$")'
111-
# Drop system.filesystem.usage for loop devices
112-
- 'metric.name == "system.filesystem.usage" and IsMatch(attributes["device"], "^/dev/loop.*$")'
196+
- 'IsMatch(metric.name, "system.network..*") and IsMatch(attributes["device"], "^(veth.*|docker.*|lo)$")'
197+
# Drop anything related to loop devices
198+
- 'IsMatch(attributes["device"], "^/dev/loop.*$")'
113199

114200
attributes/strip_fs_labels:
115201
include:
116202
match_type: strict
117-
metric_names: [system.filesystem.usage]
203+
metric_names:
204+
- system.filesystem.usage
118205
actions:
119206
- action: delete
120207
key: mode
@@ -143,7 +230,6 @@ processors:
143230
- "vault.*"
144231
- "client_proxy.*"
145232
- "Click*"
146-
- "otelcol.*"
147233
- "pgxpool.*"
148234

149235
filter/prometheus:
@@ -180,6 +266,18 @@ processors:
180266
aggregation_type: sum
181267
label_set: [instance, node_id, node_status, node_pool]
182268

269+
metricstransform/single_cpu:
270+
transforms:
271+
- include: "system.cpu.time"
272+
match_type: strict
273+
action: update
274+
operations:
275+
- action: aggregate_labels
276+
label_set:
277+
- node.id
278+
- state
279+
aggregation_type: sum
280+
183281
resourcedetection:
184282
# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor
185283
detectors: [gcp]
@@ -280,7 +378,12 @@ service:
280378
metrics/prometheus:
281379
receivers:
282380
- prometheus
283-
processors: [filter/prometheus, metricstransform, resourcedetection, transform/set-name, batch]
381+
processors:
382+
- filter/prometheus
383+
- metricstransform
384+
- resourcedetection
385+
- transform/set-name
386+
- batch
284387
exporters:
285388
- otlphttp/grafana_cloud
286389
metrics/rpc_only:
@@ -292,7 +395,12 @@ service:
292395
metrics/host:
293396
receivers:
294397
- hostmetrics
295-
processors: [filter/drop_by_device, attributes/strip_fs_labels, attributes/host_metrics_node, batch]
398+
processors:
399+
- filter/drop_by_device
400+
- attributes/strip_fs_labels
401+
- attributes/host_metrics_node
402+
- metricstransform/single_cpu
403+
- batch
296404
exporters:
297405
- otlphttp/grafana_cloud
298406
metrics/external:

iac/provider-gcp/nomad/jobs/otel-collector.hcl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ job "otel-collector" {
6262
]
6363
args = [
6464
"--config=local/config/otel-collector-config.yaml",
65-
"--feature-gates=pkg.translator.prometheus.NormalizeName",
6665
]
6766

6867
ports = [
@@ -91,4 +90,4 @@ EOF
9190
}
9291
}
9392
}
94-
}
93+
}

0 commit comments

Comments
 (0)