From df936e71c4356e533ccf7d791110c47fb5052576 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juraci=20Paix=C3=A3o=20Kr=C3=B6hling?= Date: Thu, 30 May 2024 16:01:32 +0200 Subject: [PATCH] add scalable-tail-sampling recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Juraci Paixão Kröhling --- profiling-the-collector/README.md | 2 - scalable-tail-sampling/README.md | 68 ++++++++++++++++++ .../otelcol-loadbalancer.yaml | 50 +++++++++++++ scalable-tail-sampling/otelcol-sampling.yaml | 72 +++++++++++++++++++ 4 files changed, 190 insertions(+), 2 deletions(-) create mode 100644 scalable-tail-sampling/README.md create mode 100644 scalable-tail-sampling/otelcol-loadbalancer.yaml create mode 100644 scalable-tail-sampling/otelcol-sampling.yaml diff --git a/profiling-the-collector/README.md b/profiling-the-collector/README.md index 0aa1016..83dae37 100644 --- a/profiling-the-collector/README.md +++ b/profiling-the-collector/README.md @@ -13,8 +13,6 @@ We are discarding the telemetry data that we are generating, as we are only inte - Pyroscope Agent configured using Grafana Alloy - The `telemetrygen` tool, or any other application that is able to send OTLP data to our collector - The `otelcol-cr.yaml` file from this directory -- A `GRAFANA_CLOUD_USER` environment variable, also known as Grafana Cloud instance ID, found under the instructions for "OpenTelemetry" on your Grafana Cloud stack. -- A `GRAFANA_CLOUD_TOKEN` environment variable, which can be generated under the instructions for "OpenTelemetry" on your Grafana Cloud stack. - A `GRAFANA_CLOUD_PROFILES_USER` environment variable, found under the instructions for "Pyroscope" on your Grafana Cloud stack. - A `GRAFANA_CLOUD_PROFILES_TOKEN` environment variable, which can be generated under the instructions for "Pyroscope" on your Grafana Cloud stack. 
- The endpoint for your stack diff --git a/scalable-tail-sampling/README.md b/scalable-tail-sampling/README.md new file mode 100644 index 0000000..c2b0882 --- /dev/null +++ b/scalable-tail-sampling/README.md @@ -0,0 +1,68 @@ +# 🍜 Recipe: Scalable tail sampling + +This recipe shows how to prepare a scalable tail sampling pipeline. Tail sampling is a strategy that allows the decision to be made after a trace has had enough time to be completed, and has the ability to use trace-based information to determine whether the trace is interesting or not. Because traces are kept in memory, we use a trace ID aware load balancer to consistently route spans belonging to the same trace to the same backing collectors. We have therefore two layers of collectors: one doing the load-balancing, and one doing the sampling. + +We are discarding the telemetry data that we are generating, as we are only interested in assessing this behavior by observing the Collector's metrics. + +**Note:** at this moment, not all metrics are being exported from the Collector using the new OpenTelemetry Metrics exporter. Until that is done, you might want to remove the `telemetry` section of the configuration files and scrape those metrics using a Prometheus-compatible scraper (like another OTel Collector instance with the Prometheus receiver). + +## 🧄 Ingredients + +- OpenTelemetry Operator, see the main [`README.md`](../README.md) for instructions +- The `telemetrygen` tool, or any other application that is able to send OTLP data to our collector +- The `otelcol-loadbalancer.yaml` file from this directory +- The `otelcol-sampling.yaml` file from this directory +- A `GRAFANA_CLOUD_USER` environment variable, also known as Grafana Cloud instance ID, found under the instructions for "OpenTelemetry" on your Grafana Cloud stack. +- A `GRAFANA_CLOUD_TOKEN` environment variable, which can be generated under the instructions for "OpenTelemetry" on your Grafana Cloud stack. 
+- The endpoint for your stack + +## 🥣 Preparation + +1. Create and switch to a namespace for our recipe + ```terminal + kubectl create ns scalable-tail-sampling + kubens scalable-tail-sampling + ``` + +2. Generate a basic auth HTTP header + ```terminal + echo -n "$GRAFANA_CLOUD_USER:$GRAFANA_CLOUD_TOKEN" | base64 -w0 + ``` + +3. Use the output from the command above as the value for the basic auth header on both `otelcol-sampling.yaml` and `otelcol-loadbalancer.yaml` + +4. On the same files, change the `endpoint` parameter for the `otlp` exporter within the `telemetry` node to point to your stack's endpoint, plus the path `/v1/metrics` + +5. Install the OTel Collector custom resource + ```terminal + kubectl apply -f otelcol-sampling.yaml + kubectl apply -f otelcol-loadbalancer.yaml + ``` + +6. Open a port-forward to the Collector: + ```terminal + kubectl port-forward svc/otelcol-loadbalancer-collector 4317 + ``` + +7. Send some telemetry to your Collector: those should all be sampled, given that they belong to VIP customers + ```terminal + telemetrygen traces --traces 1_000 --otlp-insecure --otlp-attributes='recipe="scalable-tail-sampling"' --otlp-attributes='vip="true"' + ``` + +8. Open your Grafana instance, go to Explore, select the metrics datasource, and verify how many spans have been received by the load balancer instances by running the following query: `receiver_accepted_spans_total{job="otelcol-loadbalancer"}` + +9. Send some more telemetry to your Collector: only 10% of those should be sampled, as they are not from VIP customers + ```terminal + telemetrygen traces --traces 1_000 --otlp-insecure --otlp-attributes='recipe="scalable-tail-sampling"' --otlp-attributes='vip="false"' + ``` + +10. 
Verify how many spans have been received by the sampling instances by running the following query: `receiver_accepted_spans_total{job="otelcol-sampling"}`
+
+
+## 😋 Executed last time with these versions
+
+The most recent execution of this recipe was done with these versions:
+
+- OpenTelemetry Operator v0.100.1
+- OpenTelemetry Collector Contrib v0.101.0
+- `telemetrygen` v0.101.0
diff --git a/scalable-tail-sampling/otelcol-loadbalancer.yaml b/scalable-tail-sampling/otelcol-loadbalancer.yaml
new file mode 100644
index 0000000..342b1bc
--- /dev/null
+++ b/scalable-tail-sampling/otelcol-loadbalancer.yaml
@@ -0,0 +1,58 @@
+# Load-balancing layer of the scalable tail sampling recipe: receives OTLP over
+# gRPC and forwards it, unprocessed, through the `loadbalancing` exporter to
+# the backends resolved from the DNS name configured below.
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: otelcol-loadbalancer
+spec:
+  image: ghcr.io/jpkrohling/otelcol-distributions/otelcol-loadbalancer:0.101.7
+  config:
+    receivers:
+      otlp:
+        protocols:
+          grpc: {}
+
+    exporters:
+      loadbalancing:
+        protocol:
+          otlp:
+            # Backends are reached over plain-text gRPC (TLS disabled).
+            tls:
+              insecure: true
+        resolver:
+          dns:
+            # Name resolved to discover the sampling collectors.
+            hostname: otelcol-sampling-collector-headless
+
+    service:
+      extensions: [ ]
+      pipelines:
+        traces:
+          receivers: [ otlp ]
+          processors: [ ]
+          exporters: [ loadbalancing ]
+        logs:
+          receivers: [ otlp ]
+          processors: [ ]
+          exporters: [ loadbalancing ]
+        metrics:
+          receivers: [ otlp ]
+          processors: [ ]
+          exporters: [ loadbalancing ]
+      telemetry:
+        # The Collector's own metrics are pushed periodically over OTLP/HTTP.
+        metrics:
+          level: detailed
+          readers:
+            - periodic:
+                exporter:
+                  otlp:
+                    # Point this at your own stack; keep the /v1/metrics path.
+                    endpoint: https://otlp-gateway-prod-eu-west-2.grafana.net/otlp/v1/metrics
+                    protocol: http/protobuf
+                    headers:
+                      # Placeholder; see the README for the real value.
+                      Authorization: "Basic ..."
+        resource:
+          "service.name": "otelcol-loadbalancer"
diff --git a/scalable-tail-sampling/otelcol-sampling.yaml b/scalable-tail-sampling/otelcol-sampling.yaml
new file mode 100644
index 0000000..1ccc9fe
--- /dev/null
+++ b/scalable-tail-sampling/otelcol-sampling.yaml
@@ -0,0 +1,69 @@
+# Sampling layer of the scalable tail sampling recipe: receives OTLP over gRPC
+# from the load balancers, applies the tail sampling policies to traces and
+# discards all data through the `nop` exporter.
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: otelcol-sampling
+spec:
+  image: ghcr.io/jpkrohling/otelcol-distributions/otelcol-sampling:0.101.7
+  replicas: 10
+  config:
+    receivers:
+      otlp:
+        protocols:
+          grpc: {}
+
+    processors:
+      tail_sampling:
+        decision_wait: 1s
+        num_traces: 50000
+        expected_new_traces_per_sec: 500
+        # A trace is kept as soon as ANY policy samples it, so these two
+        # policies keep every VIP trace plus 10% of the remaining traces.
+        policies:
+          - name: vip
+            type: string_attribute
+            string_attribute:
+              key: vip
+              values: ["true"]
+          - name: only-10-percent
+            type: probabilistic
+            probabilistic:
+              sampling_percentage: 10
+
+    exporters:
+      nop: {}
+
+    service:
+      extensions: [ ]
+      pipelines:
+        traces:
+          receivers: [ otlp ]
+          # Without this entry the tail_sampling processor is never run.
+          processors: [ tail_sampling ]
+          exporters: [ nop ]
+        logs:
+          receivers: [ otlp ]
+          processors: [ ]
+          exporters: [ nop ]
+        metrics:
+          receivers: [ otlp ]
+          processors: [ ]
+          exporters: [ nop ]
+      telemetry:
+        # The Collector's own metrics are pushed periodically over OTLP/HTTP.
+        metrics:
+          level: detailed
+          readers:
+            - periodic:
+                exporter:
+                  otlp:
+                    # Point this at your own stack; keep the /v1/metrics path.
+                    endpoint: https://otlp-gateway-prod-eu-west-2.grafana.net/otlp/v1/metrics
+                    protocol: http/protobuf
+                    headers:
+                      # Placeholder; see the README for the real value.
+                      Authorization: "Basic ..."
+        resource:
+          "service.name": "otelcol-sampling"