From df936e71c4356e533ccf7d791110c47fb5052576 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juraci=20Paix=C3=A3o=20Kr=C3=B6hling?= <juraci@kroehling.de>
Date: Thu, 30 May 2024 16:01:32 +0200
Subject: [PATCH] add scalable-tail-sampling recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Juraci Paixão Kröhling <juraci@kroehling.de>
---
 profiling-the-collector/README.md             |  2 -
 scalable-tail-sampling/README.md              | 68 ++++++++++++++++++
 .../otelcol-loadbalancer.yaml                 | 50 +++++++++++++
 scalable-tail-sampling/otelcol-sampling.yaml  | 72 +++++++++++++++++++
 4 files changed, 190 insertions(+), 2 deletions(-)
 create mode 100644 scalable-tail-sampling/README.md
 create mode 100644 scalable-tail-sampling/otelcol-loadbalancer.yaml
 create mode 100644 scalable-tail-sampling/otelcol-sampling.yaml

diff --git a/profiling-the-collector/README.md b/profiling-the-collector/README.md
index 0aa1016..83dae37 100644
--- a/profiling-the-collector/README.md
+++ b/profiling-the-collector/README.md
@@ -13,8 +13,6 @@ We are discarding the telemetry data that we are generating, as we are only inte
 - Pyroscope Agent configured using Grafana Alloy
 - The `telemetrygen` tool, or any other application that is able to send OTLP data to our collector
 - The `otelcol-cr.yaml` file from this directory
-- A `GRAFANA_CLOUD_USER` environment variable, also known as Grafana Cloud instance ID, found under the instructions for "OpenTelemetry" on your Grafana Cloud stack.
-- A `GRAFANA_CLOUD_TOKEN` environment variable, which can be generated under the instructions for "OpenTelemetry" on your Grafana Cloud stack.
 - A `GRAFANA_CLOUD_PROFILES_USER` environment variable, found under the instructions for "Pyroscope" on your Grafana Cloud stack.
 - A `GRAFANA_CLOUD_PROFILES_TOKEN` environment variable, which can be generated under the instructions for "Pyroscope" on your Grafana Cloud stack.
 - The endpoint for your stack
diff --git a/scalable-tail-sampling/README.md b/scalable-tail-sampling/README.md
new file mode 100644
index 0000000..c2b0882
--- /dev/null
+++ b/scalable-tail-sampling/README.md
@@ -0,0 +1,68 @@
+# 🍜 Recipe: Scalable tail sampling
+
+This recipe shows how to prepare a scalable tail sampling pipeline. Tail sampling is a strategy that allows the sampling decision to be made after a trace has had enough time to be completed, and has the ability to use trace-based information to determine whether the trace is interesting or not. Because traces are kept in memory, we use a trace ID aware load balancer to consistently route spans belonging to the same trace to the same backing collector. We therefore have two layers of collectors: one doing the load-balancing, and one doing the sampling.
+
+We are discarding the telemetry data that we are generating, as we are only interested in assessing this behavior by observing the Collector's metrics.
+
+**Note:** at this moment, not all metrics are being exported from the Collector using the new OpenTelemetry Metrics exporter. Until that is done, you might want to remove the `telemetry` section of the configuration files and scrape those metrics using a Prometheus-compatible scraper (like another OTel Collector instance with the Prometheus receiver).
+
+## 🧄 Ingredients
+
+- OpenTelemetry Operator, see the main [`README.md`](../README.md) for instructions
+- The `telemetrygen` tool, or any other application that is able to send OTLP data to our collector
+- The `otelcol-loadbalancer.yaml` file from this directory
+- The `otelcol-sampling.yaml` file from this directory
+- A `GRAFANA_CLOUD_USER` environment variable, also known as Grafana Cloud instance ID, found under the instructions for "OpenTelemetry" on your Grafana Cloud stack.
+- A `GRAFANA_CLOUD_TOKEN` environment variable, which can be generated under the instructions for "OpenTelemetry" on your Grafana Cloud stack.
+- The endpoint for your stack
+
+## 🥣 Preparation
+
+1. Create and switch to a namespace for our recipe
+   ```terminal
+   kubectl create ns scalable-tail-sampling
+   kubens scalable-tail-sampling
+   ```
+
+1. Generate a basic auth HTTP header
+   ```terminal
+   echo -n "$GRAFANA_CLOUD_USER:$GRAFANA_CLOUD_TOKEN" | base64 -w0
+   ```
+
+2. Use the output from the command above as the value for the basic auth header on both `otelcol-sampling.yaml` and `otelcol-loadbalancer.yaml`
+
+3. On the same files, change the `endpoint` parameter for the `otlp` exporter within the `telemetry` node to point to your stack's endpoint, plus the path `/v1/metrics`
+
+4. Install the OTel Collector custom resource
+   ```terminal
+   kubectl apply -f otelcol-sampling.yaml
+   kubectl apply -f otelcol-loadbalancer.yaml
+   ```
+
+5. Open a port-forward to the Collector:
+   ```terminal
+   kubectl port-forward svc/otelcol-loadbalancer-collector 4317
+   ```
+
+6. Send some telemetry to your Collector: those should all be sampled, given that they belong to VIP customers
+   ```terminal
+   telemetrygen traces --traces 1_000 --otlp-insecure --otlp-attributes='recipe="scalable-tail-sampling"' --otlp-attributes='vip="true"'
+   ```
+
+7. Open your Grafana instance, go to Explore, select the metrics datasource, and verify how many spans have been received by the load balancer instances by running the following query: `receiver_accepted_spans_total{job="otelcol-loadbalancer"}`
+
+8. Send some more telemetry to your Collector: only 10% of those should be sampled, as they are not from VIP customers
+   ```terminal
+   telemetrygen traces --traces 1_000 --otlp-insecure --otlp-attributes='recipe="scalable-tail-sampling"' --otlp-attributes='vip="false"'
+   ```
+
+9. Verify how many spans have been received by the sampling instances by running the following query: `receiver_accepted_spans_total{job="otelcol-sampling"}`
+
+
+## 😋 Executed last time with these versions
+
+The most recent execution of this recipe was done with these versions:
+
+- OpenTelemetry Operator v0.100.1
+- OpenTelemetry Collector Contrib v0.101.0
+- `telemetrygen` v0.101.0
diff --git a/scalable-tail-sampling/otelcol-loadbalancer.yaml b/scalable-tail-sampling/otelcol-loadbalancer.yaml
new file mode 100644
index 0000000..342b1bc
--- /dev/null
+++ b/scalable-tail-sampling/otelcol-loadbalancer.yaml
@@ -0,0 +1,50 @@
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector  # load-balancing tier of the recipe: receives OTLP, forwards via the loadbalancing exporter
+metadata:
+  name: otelcol-loadbalancer  # the README port-forwards to svc/otelcol-loadbalancer-collector
+spec:
+  image: ghcr.io/jpkrohling/otelcol-distributions/otelcol-loadbalancer:0.101.7
+  config:
+    receivers:
+      otlp:
+        protocols:
+          grpc: {}  # only the gRPC protocol is enabled on this receiver
+
+    exporters:
+      loadbalancing:
+        protocol:
+          otlp:
+            tls:
+              insecure: true  # the hop to the sampling collectors is not TLS-protected
+        resolver:
+          dns:
+            # NOTE(review): presumably the headless Service of the otelcol-sampling resource; confirm
+            hostname: otelcol-sampling-collector-headless
+
+    service:
+      extensions: [ ]
+      pipelines:
+        traces:
+          receivers:  [ otlp ]
+          processors: [  ]  # no processors in this tier; sampling is left to the second tier
+          exporters:  [ loadbalancing ]
+        logs:
+          receivers:  [ otlp ]
+          processors: [  ]
+          exporters:  [ loadbalancing ]
+        metrics:
+          receivers:  [ otlp ]
+          processors: [  ]
+          exporters:  [ loadbalancing ]
+      telemetry:  # the Collector's own metrics, pushed over OTLP/HTTP
+        metrics:
+          level: detailed
+          readers:
+            - periodic:
+                exporter:
+                  otlp:  # edit endpoint and Authorization below for your stack (see README, Preparation)
+                    endpoint: https://otlp-gateway-prod-eu-west-2.grafana.net/otlp/v1/metrics
+                    protocol: http/protobuf
+                    headers:
+                      Authorization: "Basic ..."  # placeholder: paste the base64 value generated in the README
+        resource:
+          "service.name": "otelcol-loadbalancer"  # NOTE(review): README queries job="otelcol-loadbalancer"; presumably derived from this
diff --git a/scalable-tail-sampling/otelcol-sampling.yaml b/scalable-tail-sampling/otelcol-sampling.yaml
new file mode 100644
index 0000000..1ccc9fe
--- /dev/null
+++ b/scalable-tail-sampling/otelcol-sampling.yaml
@@ -0,0 +1,72 @@
+# Second tier of the recipe: tail-samples the traces routed here by the
+# otelcol-loadbalancer collectors, then discards them via the nop exporter.
+apiVersion: opentelemetry.io/v1beta1
+kind: OpenTelemetryCollector
+metadata:
+  name: otelcol-sampling
+spec:
+  image: ghcr.io/jpkrohling/otelcol-distributions/otelcol-sampling:0.101.7
+  replicas: 10
+  config:
+    receivers:
+      otlp:
+        protocols:
+          grpc: {}
+
+    processors:
+      tail_sampling:
+        decision_wait: 1s
+        num_traces: 50000
+        expected_new_traces_per_sec: 500
+        # Policies are OR-ed: a trace is kept as soon as one policy votes to
+        # sample it. Do not wrap these two in an "and" policy: that requires both
+        # to match, which keeps only 10% of VIP traces and drops everything else.
+        policies:
+          # Keep every trace carrying vip="true"; the README sets this attribute
+          # through telemetrygen's --otlp-attributes flag.
+          - name: vip
+            type: string_attribute
+            string_attribute:
+              key: vip
+              values:
+                - "true"
+          # Keep a 10% probabilistic sample of all traces, which is the rate the
+          # non-VIP traffic ends up with.
+          - name: only-10-percent
+            type: probabilistic
+            probabilistic:
+              sampling_percentage: 10
+
+    exporters:
+      nop: {}  # sampled data is discarded; the recipe only looks at Collector metrics
+
+    service:
+      extensions: [ ]
+      pipelines:
+        traces:
+          receivers:  [ otlp ]
+          # tail_sampling only runs when it is listed in a pipeline; defining it
+          # under "processors" above is not enough.
+          processors: [ tail_sampling ]
+          exporters:  [ nop ]
+        logs:
+          receivers:  [ otlp ]
+          processors: [  ]
+          exporters:  [ nop ]
+        metrics:
+          receivers:  [ otlp ]
+          processors: [  ]
+          exporters:  [ nop ]
+      telemetry:  # the Collector's own metrics, pushed over OTLP/HTTP
+        metrics:
+          level: detailed
+          readers:
+            - periodic:
+                exporter:
+                  otlp:  # edit endpoint and Authorization below for your stack (see README, Preparation)
+                    endpoint: https://otlp-gateway-prod-eu-west-2.grafana.net/otlp/v1/metrics
+                    protocol: http/protobuf
+                    headers:
+                      Authorization: "Basic ..."  # placeholder: paste the base64 value generated in the README
+        resource:
+          "service.name": "otelcol-sampling"