
Commit b95f2bb

Add prometheus deployment to autojoin cluster (#1057)
* Rename deploy script
* Make deployment steps generic
* Move to per-cluster deployment
* Add cluster name as template parameter
* Add autojoin cluster parameters
* Update build steps for multiple clusters and projects
* Add Helm config for autojoin cluster with public IPs
* Add byos to autojoin cluster prometheus config
* Remove byos config from prometheus-federation
* Add per cluster config for autojoin
* Make deployments conditional
1 parent 5e0b575 commit b95f2bb

36 files changed: +771 -59 lines changed

apply-data-pipeline.sh renamed to apply-cluster.sh

Lines changed: 6 additions & 6 deletions
@@ -16,12 +16,12 @@ source config.sh
 
 # Replace the template variables.
 sed -e 's|{{CLUSTER}}|'${CLUSTER}'|g' \
-  config/cluster/prometheus/prometheus.yml.template > \
-  config/cluster/prometheus/prometheus.yml
+  config/${CLUSTER}/prometheus/prometheus.yml.template > \
+  config/${CLUSTER}/prometheus/prometheus.yml
 
 # Prometheus config map.
 kubectl create configmap prometheus-cluster-config \
-  --from-file=config/cluster/prometheus \
+  --from-file=config/${CLUSTER}/prometheus \
   --dry-run="client" -o json | kubectl apply -f -
 
 kubectl create secret generic prometheus-auth \
@@ -32,7 +32,7 @@ kubectl create secret generic prometheus-auth \
 sed -i -e 's|{{OAUTH_PROXY_CLIENT_ID}}|'${!OAUTH_PROXY_CLIENT_ID}'|g' \
   -e 's|{{OAUTH_PROXY_CLIENT_SECRET}}|'${!OAUTH_PROXY_CLIENT_SECRET}'|g' \
   -e 's|{{OAUTH_PROXY_COOKIE_SECRET}}|'${!OAUTH_PROXY_COOKIE_SECRET}'|g' \
-  k8s/data-pipeline/deployments/oauth2-proxy.yml
+  k8s/${CLUSTER}/deployments/oauth2-proxy.yml
 
 # Additional k8s resources installed via Helm
 #
@@ -41,7 +41,7 @@ kubectl create namespace ingress-nginx --dry-run="client" -o json | kubectl appl
 ./linux-amd64/helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \
   --namespace ingress-nginx \
   --version ${K8S_INGRESS_NGINX_VERSION} \
-  --values helm/data-pipeline/ingress-nginx/${PROJECT}.yml
+  --values helm/${CLUSTER}/ingress-nginx/${PROJECT}.yml
 
 
 # Install cert-manager.
@@ -59,7 +59,7 @@ kubectl create namespace ingress-nginx --dry-run="client" -o json | kubectl appl
   --set installCRDs=true \
   --set ingressShim.defaultIssuerKind=ClusterIssuer \
   --set ingressShim.defaultIssuerName=letsencrypt
-
+
 # Check for per-project template variables.
 if [[ ! -f "k8s/${CLUSTER}/${PROJECT}.yml" ]] ; then
   echo "No template variables found for k8s/${CLUSTER}/${PROJECT}.yml"

cloudbuild.yaml

Lines changed: 32 additions & 22 deletions
@@ -149,25 +149,35 @@ steps:
       # Check all JSON files, mostly (likely only) provisioned Grafana dashboards
       find . -type f -name '*.json' | xargs jsonlint-php -q
 
-      # Check alert and recording rules
-      promtool check rules ./config/federation/prometheus/alerts.yml
-      promtool check rules ./config/federation/prometheus/rules.yml
-
-      export CLUSTER=prometheus-federation
-
-      # Get cluster credentials for the prometheus-federation cluster
-      gcloud container clusters get-credentials $$CLUSTER --project $$PROJECT --zone $$(get_cluster_zone $$CLUSTER)
-
-      # Apply various things in the prometheus-federation cluster
-      ./apply-global-prometheus.sh
-      ./apply-grafana-dashboards.sh
-      ./deploy-prometheus-targets.sh $$PROJECT
-
-      export CLUSTER=data-pipeline
-
-      # Get cluster credentials for the data-pipeline cluster
-      gcloud container clusters get-credentials $$CLUSTER --project $$PROJECT --zone $$(get_cluster_zone $$CLUSTER)
-      ./apply-data-pipeline.sh
-
-      # Deploy the IPv6 monitoring BBE configs to the IPv6 Linode.
-      ./deploy_bbe_config.sh $$PROJECT LINODE_PRIVATE_KEY_ipv6_monitoring
+      # TODO(soltesz): Separate configuration steps so we can use cbif conditions.
+      if [[ $$PROJECT = "mlab-sandbox" || $$PROJECT = "mlab-staging" || $$PROJECT = "mlab-oti" ]] ; then
+        # Check alert and recording rules
+        promtool check rules ./config/federation/prometheus/alerts.yml
+        promtool check rules ./config/federation/prometheus/rules.yml
+
+        export CLUSTER=prometheus-federation
+
+        # Get cluster credentials for the prometheus-federation cluster
+        gcloud container clusters get-credentials $$CLUSTER --project $$PROJECT --zone $$(get_cluster_zone $$CLUSTER)
+
+        # Apply various things in the prometheus-federation cluster
+        ./apply-global-prometheus.sh
+        ./apply-grafana-dashboards.sh
+        ./deploy-prometheus-targets.sh $$PROJECT
+
+        # Get cluster credentials for the data-pipeline cluster
+        export CLUSTER=data-pipeline
+        gcloud container clusters get-credentials $$CLUSTER --project $$PROJECT --zone $$(get_cluster_zone $$CLUSTER)
+        ./apply-cluster.sh
+
+        # Deploy the IPv6 monitoring BBE configs to the IPv6 Linode.
+        ./deploy_bbe_config.sh $$PROJECT LINODE_PRIVATE_KEY_ipv6_monitoring
+      fi
+
+      # TODO(soltesz): Separate configuration steps so we can use cbif conditions.
+      if [[ $$PROJECT = "mlab-sandbox" || $$PROJECT = "mlab-staging" || $$PROJECT = "mlab-autojoin" ]] ; then
+        export CLUSTER=autojoin
+        # Get cluster credentials for the autojoin cluster
+        gcloud container clusters get-credentials $$CLUSTER --project $$PROJECT --zone $$(get_cluster_zone $$CLUSTER)
+        ./apply-cluster.sh
+      fi
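
Since apply-cluster.sh renders config/${CLUSTER}/prometheus/prometheus.yml.template
with sed before creating the config map, a rough local sanity check of the new
autojoin template is possible. A sketch only; the build itself only runs
promtool against the federation rules, not this config:

    export CLUSTER=autojoin
    sed -e 's|{{CLUSTER}}|'${CLUSTER}'|g' \
      config/${CLUSTER}/prometheus/prometheus.yml.template > /tmp/prometheus.yml
    # Note: the TLS and token paths in the rendered config only exist inside
    # the cluster, so promtool may complain about missing files off-cluster.
    promtool check config /tmp/prometheus.yml
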
config/autojoin/prometheus/prometheus.yml.template

Lines changed: 268 additions & 0 deletions
@@ -0,0 +1,268 @@
+# M-Lab Prometheus configuration.
+
+global:
+  scrape_interval: 60s      # Set the scrape interval to every 60 seconds.
+  evaluation_interval: 60s  # Evaluate rules every 60 seconds.
+  # scrape_timeout is set to the global default (10s).
+
+  # These labels are attached to any time series or alert sent to external
+  # systems (federation, remote storage, Alertmanager).
+  # TODO(soltesz): use this when M-Lab adds federation or alertmanager.
+  external_labels:
+    cluster: {{CLUSTER}}
+
+
+# Load rules once and periodically evaluate them according to the global
+# 'evaluation_interval'.
+rule_files:
+  # - /etc/prometheus/rules.yml
+
+# Scrape configurations.
+#
+# Each job name defines monitoring targets (or a method for discovering
+# targets).
+#
+# The M-Lab Prometheus configuration uses three config types:
+#  * automatically discovered services via kubernetes (kubernetes_sd_config)
+#  * automatically discovered services via file (file_sd_config)
+#  * static targets (static_config)
+#
+# Kubernetes targets are discovered automatically by querying the kubernetes
+# master API. The configuration for this is simplest when Prometheus runs in
+# the same cluster as the kubernetes master being monitored. In particular,
+# the master CA certificates and an authentication token are mounted
+# automatically in every container's filesystem for easy access.
+#
+# Discovery of legacy targets occurs by reading a configuration file. This
+# configuration file can be updated out of band after start, and Prometheus
+# will periodically re-read the contents, adding new targets or removing old
+# ones.
+#
+# Static targets cannot change after Prometheus starts; they are the least
+# flexible. Because of this, only well-known, long-lived, or singleton
+# targets that need special relabeling rules should be static.
scrape_configs:
+
+  # Kubernetes configurations were inspired by:
+  # https://github.com/prometheus/prometheus/blob/main/documentation/examples
+  #
+  # The four kubernetes scrape configs correspond to specific cluster
+  # components.
+  #  * master API
+  #  * cluster nodes
+  #  * pods
+  #  * service endpoints
+  #
+  # The separation allows each component to use different authentication
+  # configs, or apply different relabeling rules.
+
+  # Scrape config for kubernetes master API server.
+  #
+  # The kubernetes API is exposed as an "endpoint". Since kubernetes may have
+  # many endpoints, this configuration restricts the targets monitored to the
+  # default/kubernetes service. The relabeling rules ignore other endpoints.
+  - job_name: 'kubernetes-apiservers'
+    kubernetes_sd_configs:
+      - role: endpoints
+
+    # The kubernetes API requires authentication and uses a privately signed
+    # certificate. The tls_config specifies the private CA cert and an
+    # auth token. Kubernetes automatically mounts these files in the container
+    # filesystem.
+    scheme: https
+    tls_config:
+      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    # The source_labels are concatenated with ';'. The regex matches a single
+    # value for the default kubernetes service endpoint. If there are
+    # multiple API servers, all will match this pattern.
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_namespace,
+                        __meta_kubernetes_service_name,
+                        __meta_kubernetes_endpoint_port_name]
+        action: keep
+        regex: default;kubernetes;https
+
+
+  # Scrape config for kubernetes nodes.
+  #
+  # A kubernetes cluster consists of one or more nodes. Each reports metrics
+  # related to the whole machine.
+  - job_name: 'kubernetes-nodes'
+    kubernetes_sd_configs:
+      - role: node
+
+    scheme: https
+    tls_config:
+      ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+
+      # Nodes are discovered and scraped using the kubernetes internal network
+      # IP. Unfortunately, the certificates do not validate on requests:
+      #
+      #   "x509: cannot validate certificate for 10.0.4.126 because it doesn't
+      #    contain any IP SANs"
+      #
+      # This is a known issue without a likely solution for private APIs:
+      #   https://github.com/prometheus/prometheus/issues/1822
+      #
+      # Since these IPs are internal to the kubernetes virtual network, it
+      # should be safe to skip certificate verification.
+      insecure_skip_verify: true
+    # TODO(soltesz): if we skip_verify, do we still need the bearer token?
+    bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+    # Copy node labels from kubernetes to labels on the Prometheus metrics.
+    # TODO(soltesz): There are many labels. Some look unnecessary. Restrict
+    # pattern to match helpful labels.
+    relabel_configs:
+      - action: labelmap
+        regex: __meta_kubernetes_node_label_(.+)
+      # Node /metrics in v1.6+ are accessible via a proxy through the
+      # kubernetes api server. So, we must update the target and metric path.
+      - target_label: __address__
+        replacement: kubernetes.default.svc:443
+      - source_labels: [__meta_kubernetes_node_name]
+        regex: (.+)
+        target_label: __metrics_path__
+        replacement: /api/v1/nodes/${1}/proxy/metrics
+
+
+  # Scrape config for kubernetes pods.
+  #
+  # Kubernetes pods are scraped when they have an annotation:
+  #   `prometheus.io/scrape=true`.
+  #
+  # Only containers that include an explicit containerPort declaration are
+  # scraped. For example:
+  #
+  #   ports:
+  #     - containerPort: 9090
+  #
+  # Configuration expects the default HTTP protocol scheme.
+  # Configuration expects the default path of /metrics on targets.
+  - job_name: 'kubernetes-pods'
+    kubernetes_sd_configs:
+      - role: pod
+
+    relabel_configs:
+      # For inventory, record whether a pod is ready. This helps distinguish
+      # between: missing from inventory, not ready and failing, ready but
+      # failing, ready and working.
+      - source_labels: [__meta_kubernetes_pod_ready]
+        action: replace
+        target_label: ready
+
+      # Check for the prometheus.io/scrape=true annotation.
+      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+        action: keep
+        regex: true
+
+      # Only keep containers that have a declared container port.
+      - source_labels: [__meta_kubernetes_pod_container_port_number]
+        action: keep
+        regex: (\d+)
+
+      # Copy all pod labels from kubernetes to the Prometheus metrics.
+      - action: labelmap
+        regex: __meta_kubernetes_pod_label_(.+)
+
+      # Add the kubernetes namespace as a Prometheus label.
+      - source_labels: [__meta_kubernetes_namespace]
+        action: replace
+        target_label: namespace
+
+      # Extract the "<cluster>-<node-pool>" name from the GKE node name.
+      - source_labels: [__meta_kubernetes_pod_node_name]
+        action: replace
+        regex: gke-(.*)(-[^-]+){2}
+        replacement: $1
+        target_label: nodepool
+
+      # Identify the deployment name for replica set or daemon set. Pods
+      # created by deployments or daemon sets are processed here. The
+      # following two rules recognize these two cases.
+      #
+      # 1: For DaemonSet, remove the last 5-digit pod name hash.
+      #    e.g. node-exporter-ltxgz
+      - source_labels: [__meta_kubernetes_pod_controller_kind, __meta_kubernetes_pod_name]
+        action: replace
+        regex: DaemonSet;(.*)(-[^-]{5})
+        replacement: $1
+        target_label: deployment
+
+      # 2: For ReplicaSet, remove the last 10-digit + 5-digit pod name hash.
+      #    In the case of a daemon set that does not have the trailing hash,
+      #    the regex will not match and deployment remains unchanged.
+      #    e.g. prometheus-server-3165440997-ppf9w
+      - source_labels: [__meta_kubernetes_pod_controller_kind, __meta_kubernetes_pod_name]
+        action: replace
+        regex: ReplicaSet;(.*)(-[^-]+)(-[^-]{5})
+        replacement: $1
+        target_label: deployment
+
+      # TODO(soltesz): evaluate and remove from config if the pod name is not
+      # helpful in practice.
+      #
+      # Add the kubernetes pod name.
+      #- source_labels: [__meta_kubernetes_pod_name]
+      #  action: replace
+      #  target_label: pod
+
+      # Add the kubernetes pod container name.
+      - source_labels: [__meta_kubernetes_pod_container_name]
+        action: replace
+        target_label: container
+
+
+  # Scrape config for kubernetes service endpoints.
+  #
+  # Service endpoints are scraped when they have an annotation:
+  #   `prometheus.io/scrape=true`.
+  #
+  # Port 80 is scraped by default. To use a different port, use the
+  # annotation:
+  #   `prometheus.io/port=9090`.
+  #
+  # Configuration expects the default HTTP protocol scheme.
+  # Configuration expects the default path of /metrics on targets.
+  - job_name: 'kubernetes-service-endpoints'
+    kubernetes_sd_configs:
+      - role: endpoints
+
+    relabel_configs:
+      # Check for the prometheus.io/scrape=true annotation.
+      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+        action: keep
+        regex: true
+      # Check for the prometheus.io/port=<port> annotation.
+      - source_labels: [__address__,
+                        __meta_kubernetes_service_annotation_prometheus_io_port]
+        action: replace
+        target_label: __address__
+        # A google/re2 regex, matching addresses with or without default ports.
+        # NB: this will not work with IPv6 addresses. But, atm, kubernetes uses
+        # IPv4 addresses for the internal network and GCE does not support IPv6.
+        regex: ([^:]+)(?::\d+)?;(\d+)
+        replacement: $1:$2
+      # Copy all service labels from kubernetes to the Prometheus metrics.
+      - action: labelmap
+        regex: __meta_kubernetes_service_label_(.+)
+      # Add the kubernetes namespace as a Prometheus label.
+      - source_labels: [__meta_kubernetes_namespace]
+        action: replace
+        target_label: kubernetes_namespace
+      # Add the kubernetes service name as a Prometheus label.
+      - source_labels: [__meta_kubernetes_service_name]
+        action: replace
+        target_label: kubernetes_name
+
+
+  # Scrape byos-nodes every minute.
+  - job_name: 'byos-nodes'
+    scrape_timeout: 40s
+    file_sd_configs:
+      - files:
+          - /byos-nodes/*.json
+        # Attempt to re-read files every five minutes.
+        refresh_interval: 5m
+    scheme: http
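
The byos-nodes job discovers targets through file_sd_configs, re-reading
/byos-nodes/*.json every five minutes. A sketch of what one of those files
could contain, using Prometheus's standard file_sd JSON format (the address
and label are purely illustrative; the real files are provisioned out of
band):

    # Hypothetical target file for the byos-nodes job.
    echo '[{"targets": ["203.0.113.10:9990"], "labels": {"machine": "example-byos-node"}}]' \
      > /byos-nodes/example.json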

config/federation/prometheus/prometheus.yml.template

Lines changed: 0 additions & 10 deletions
@@ -801,13 +801,3 @@ scrape_configs:
         regex: .*
         target_label: __address__
         replacement: switch-monitoring-service.default.svc.cluster.local:8080
-
-  # Scrape byos-nodes every minute.
-  - job_name: 'byos-nodes'
-    scrape_timeout: 40s
-    file_sd_configs:
-      - files:
-          - /byos-nodes/*.json
-        # Attempt to re-read files every five minutes.
-        refresh_interval: 5m
-    scheme: http
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+controller:
+  service:
+    loadBalancerIP: 34.30.73.176
+  ingressClassResource:
+    default: true
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+controller:
+  service:
+    loadBalancerIP: 34.30.138.62
+  ingressClassResource:
+    default: true
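
Both values files pin a public load-balancer IP and mark the controller's
ingress class as the cluster default. They are consumed by the ingress-nginx
step in apply-cluster.sh; a sketch of the equivalent manual invocation (which
project each IP belongs to is not shown in this diff):

    helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \
      --namespace ingress-nginx \
      --version "${K8S_INGRESS_NGINX_VERSION}" \
      --values helm/autojoin/ingress-nginx/${PROJECT}.yml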
