Commit cd40f01
feat: kube-scheduler local dev (#1116)
1 parent: 543b2f1

4 files changed: +120 −36 lines

Makefile

Lines changed: 5 additions & 4 deletions

@@ -40,22 +40,23 @@ dev: generate
 .PHONY: dev-port-forward
 dev-port-forward:
-	kubectl --context k3d-kubernetes-mixin port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090
+	kubectl --context kind-kubernetes-mixin wait --for=condition=Ready pods -l app=lgtm --timeout=300s
+	kubectl --context kind-kubernetes-mixin port-forward service/lgtm 3000:3000 4317:4317 4318:4318 9090:9090
 
 dev-reload: generate
 	@cp -v prometheus_alerts.yaml scripts/provisioning/prometheus/ && \
 	cp -v prometheus_rules.yaml scripts/provisioning/prometheus/ && \
-	kubectl --context k3d-kubernetes-mixin rollout restart deployment/lgtm && \
+	kubectl --context kind-kubernetes-mixin rollout restart deployment/lgtm && \
 	echo '╔═══════════════════════════════════════════════════════════════╗' && \
 	echo '║                                                               ║' && \
 	echo '║        🔄 Reloading Alert and Recording Rules...              ║' && \
 	echo '║                                                               ║' && \
 	echo '╚═══════════════════════════════════════════════════════════════╝' && \
-	kubectl --context k3d-kubernetes-mixin rollout status deployment/lgtm
+	kubectl --context kind-kubernetes-mixin rollout status deployment/lgtm
 
 .PHONY: dev-down
 dev-down:
-	k3d cluster delete kubernetes-mixin
+	kind delete cluster --name kubernetes-mixin
 
 .PHONY: generate
 generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR)
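The new `dev-port-forward` target gates the port-forward on pod readiness, so the forward no longer races the `lgtm` deployment coming up. The same guard pattern can be sketched as a small retry helper (hypothetical, not part of the Makefile):

```shell
#!/bin/sh
# Sketch: retry a command until it succeeds, mirroring the
# wait-before-forward ordering of the dev-port-forward target.
retry() {
  attempts=$1
  shift
  i=0
  until "$@"; do
    i=$((i + 1))
    [ "$i" -ge "$attempts" ] && return 1
    sleep 1
  done
}

# Usage against a real cluster (context name taken from the Makefile):
#   retry 30 kubectl --context kind-kubernetes-mixin get pods -l app=lgtm
```

`kubectl wait --for=condition=Ready` does the same job in one call; the helper only makes the retry-until-ready ordering explicit.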

README.md

Lines changed: 46 additions & 13 deletions

@@ -2,10 +2,41 @@
 [![ci](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml/badge.svg)](https://github.com/kubernetes-monitoring/kubernetes-mixin/actions/workflows/ci.yaml)
 
-> NOTE: This project is *pre-release* stage. Flags, configuration, behaviour and design may change significantly in following releases.
-
 A set of Grafana dashboards and Prometheus alerts for Kubernetes.
 
+## Local development
+
+Run the following command to set up a local [kind](https://kind.sigs.k8s.io) cluster:
+
+```shell
+make dev
+```
+
+You should see the following output if successful:
+
+```shell
+╔═══════════════════════════════════════════════════════════════╗
+║           🚀 Development Environment Ready! 🚀                ║
+║                                                               ║
+║  Run `make dev-port-forward`                                  ║
+║  Grafana will be available at http://localhost:3000           ║
+║                                                               ║
+║  Data will be available in a few minutes.                     ║
+║                                                               ║
+║  Dashboards will refresh every 10s, run `make generate`       ║
+║  and refresh your browser to see the changes.                 ║
+║                                                               ║
+║  Alert and recording rules require `make dev-reload`.         ║
+║                                                               ║
+╚═══════════════════════════════════════════════════════════════╝
+```
+
+To delete the cluster, run the following:
+
+```shell
+make dev-down
+```
+
 ## Releases
 
 > Note: Releases up until `release-0.12` are changes in their own branches. Changelogs are included in releases starting from [version-0.13.0](https://github.com/kubernetes-monitoring/kubernetes-mixin/releases/tag/version-0.13.0).

@@ -33,7 +64,7 @@ Some alerts now use Prometheus filters made available in Prometheus 2.11.0, which
 Warning: This compatibility matrix was initially created based on experience; we do not guarantee the compatibility, and it may be updated based on new learnings.
 
-Warning: By default the expressions will generate *grafana 7.2+* compatible rules using the *$__rate_interval* variable for rate functions. If you need backward compatible rules please set *grafana72: false* in your *_config*
+Warning: By default the expressions will generate *grafana 7.2+* compatible rules using the *$\_\_rate_interval* variable for rate functions. If you need backward-compatible rules, please set *grafana72: false* in your *\_config*.
 
 ### Release steps

@@ -75,6 +106,7 @@ node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m
 This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler):
 
 You then have three options for deploying your dashboards
+
 1. Generate the config files and deploy them yourself
 2. Use ksonnet to deploy this mixin along with Prometheus and Grafana
 3. Use prometheus-operator to deploy this mixin (TODO)

@@ -109,11 +141,12 @@ The `prometheus_alerts.yaml` and `prometheus_rules.yaml` files then need to be passed
 ### Dashboards for Windows Nodes
 
 There exist separate dashboards for Windows resources.
-1) Compute Resources / Cluster(Windows)
-2) Compute Resources / Namespace(Windows)
-3) Compute Resources / Pod(Windows)
-4) USE Method / Cluster(Windows)
-5) USE Method / Node(Windows)
+
+1. Compute Resources / Cluster(Windows)
+2. Compute Resources / Namespace(Windows)
+3. Compute Resources / Pod(Windows)
+4. USE Method / Cluster(Windows)
+5. USE Method / Node(Windows)
 
 These dashboards are based on metrics populated by [windows-exporter](https://github.com/prometheus-community/windows_exporter) from each Windows node.

@@ -270,14 +303,14 @@ The same result can be achieved by modifying the existing `config.libsonnet` with the
 While the community has not yet fully agreed on alert severities and how they are to be used, this repository assumes the following paradigms when setting the severities:
 
-* Critical: An issue, that needs to page a person to take instant action
-* Warning: An issue, that needs to be worked on but in the regular work queue or for during office hours rather than paging the oncall
-* Info: Is meant to support a trouble shooting process by informing about a non-normal situation for one or more systems but not worth a page or ticket on its own.
+- Critical: An issue that needs to page a person to take instant action
+- Warning: An issue that needs to be worked on, but in the regular work queue or during office hours, rather than paging the on-call
+- Info: Meant to support a troubleshooting process by informing about a non-normal situation for one or more systems, but not worth a page or ticket on its own.
 
 ### Architecture and Technical Decisions
 
-* For more motivation, see "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin.
-* For more information about monitoring mixins, see this [design doc](DESIGN.md).
+- For more motivation, see the "[The RED Method: How to instrument your services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal?iframe=no&w=100%&sidebar=yes&bg=no)" talk from CloudNativeCon Austin.
+- For more information about monitoring mixins, see this [design doc](DESIGN.md).
 
 ## Note
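The `grafana72` toggle in the warning touched by this diff is set in the mixin's `_config`; a minimal sketch in jsonnet, the repo's own language (the import path is an assumption, adjust it to your vendoring layout):

```jsonnet
// Sketch: generate backward-compatible rate rules by disabling
// the Grafana 7.2+ $__rate_interval behaviour via _config.
(import 'kubernetes-mixin/mixin.libsonnet') + {
  _config+:: {
    grafana72: false,
  },
}
```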

scripts/lgtm.sh

Lines changed: 38 additions & 18 deletions

@@ -1,27 +1,47 @@
 #!/bin/bash
-
 set -ex
 
-# export time in milliseconds
-# export OTEL_METRIC_EXPORT_INTERVAL=500
-
-# use http instead of https (needed because of https://github.com/open-telemetry/opentelemetry-go/issues/4834)
-# export OTEL_EXPORTER_OTLP_INSECURE="true"
-
-# https://github.com/grafana/docker-otel-lgtm/tree/main/examples
-
-# docker run -p 3001:3000 -p 4317:4317 -p 4318:4318 \
-#   -v ./provisioning/dashboards:/otel-lgtm/grafana/conf/provisioning/dashboards \
-#   -v ../dashboards_out:/kubernetes-mixin/dashboards_out \
-#   --rm -ti grafana/otel-lgtm
-
 cp ../prometheus_alerts.yaml provisioning/prometheus/
 cp ../prometheus_rules.yaml provisioning/prometheus/
 
-# set up 1-node k3d cluster
-k3d cluster create kubernetes-mixin \
-  -v "$PWD"/provisioning:/kubernetes-mixin/provisioning \
-  -v "$PWD"/../dashboards_out:/kubernetes-mixin/dashboards_out
+# Create kind cluster with kube-scheduler resource metrics enabled
+kind create cluster --name kubernetes-mixin --config - <<EOF
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+- role: control-plane
+  kubeadmConfigPatches:
+  - |
+    kind: ClusterConfiguration
+    scheduler:
+      extraArgs:
+        authorization-always-allow-paths: "/metrics,/metrics/resources"
+        bind-address: "0.0.0.0"
+  extraMounts:
+  - hostPath: "$PWD/provisioning"
+    containerPath: /kubernetes-mixin/provisioning
+  - hostPath: "$PWD/../dashboards_out"
+    containerPath: /kubernetes-mixin/dashboards_out
+EOF
+
+# Wait for cluster to be ready
+kubectl wait --for=condition=Ready nodes --all --timeout=300s
+
+# Create kube-scheduler service for metrics access
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-scheduler
+  namespace: kube-system
+spec:
+  selector:
+    component: kube-scheduler
+  ports:
+  - port: 10259
+    targetPort: 10259
+    protocol: TCP
+EOF
 
 # run grafana, prometheus
 kubectl apply -f lgtm.yaml
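The `authorization-always-allow-paths` scheduler flag exempts only the exact paths listed in its comma-separated value from authorization, which is why both `/metrics` and `/metrics/resources` must appear. A small local check (hypothetical helper, not part of the script) illustrates the matching:

```shell
#!/bin/sh
# The value passed to the kube-scheduler in the kind config above.
ALLOW_PATHS="/metrics,/metrics/resources"

# path_allowed: succeed if $1 is one of the comma-separated entries.
path_allowed() {
  case ",$ALLOW_PATHS," in
    *",$1,"*) return 0 ;;
    *) return 1 ;;
  esac
}

path_allowed /metrics && echo "/metrics is exempt from authorization"
path_allowed /metrics/resources && echo "/metrics/resources is exempt"
```

A path not in the list, such as `/healthz`, would still require an authorized client.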

scripts/otel-collector-deployment.values.yaml

Lines changed: 31 additions & 1 deletion

@@ -29,6 +29,7 @@ clusterRole:
       - watch
   - nonResourceURLs:
       - /metrics
+      - /metrics/resources
     verbs:
       - get
   - apiGroups:

@@ -130,7 +131,36 @@ config:
           target_label: instance
         - source_labels: [__meta_kubernetes_namespace]
           target_label: namespace
-
+
+      - job_name: kube-scheduler
+        kubernetes_sd_configs:
+          - role: service
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_service_name]
+            action: keep
+            regex: kube-scheduler
+          - source_labels: [__meta_kubernetes_namespace]
+            action: keep
+            regex: kube-system
+        scheme: https
+        tls_config:
+          insecure_skip_verify: true
+
+      - job_name: kube-scheduler-resources
+        kubernetes_sd_configs:
+          - role: service
+        relabel_configs:
+          - source_labels: [__meta_kubernetes_service_name]
+            action: keep
+            regex: kube-scheduler
+          - source_labels: [__meta_kubernetes_namespace]
+            action: keep
+            regex: kube-system
+        metrics_path: /metrics/resources
+        scheme: https
+        tls_config:
+          insecure_skip_verify: true
+
     processors:
       batch: {}
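The two scrape jobs added above differ only in `job_name` and `metrics_path` (the first job uses Prometheus's default path, `/metrics`); both keep only targets whose discovered service is `kube-scheduler` in `kube-system`. The shared shape can be sketched with a small generator (hypothetical helper, not part of the chart values):

```shell
#!/bin/sh
# gen_job: emit one kube-scheduler scrape job; only job_name and
# metrics_path vary between the two jobs in the values file.
gen_job() {
  name=$1
  path=$2
  cat <<EOF
- job_name: $name
  kubernetes_sd_configs:
    - role: service
  relabel_configs:
    - source_labels: [__meta_kubernetes_service_name]
      action: keep
      regex: kube-scheduler
    - source_labels: [__meta_kubernetes_namespace]
      action: keep
      regex: kube-system
  metrics_path: $path
  scheme: https
  tls_config:
    insecure_skip_verify: true
EOF
}

gen_job kube-scheduler /metrics
gen_job kube-scheduler-resources /metrics/resources
```

`insecure_skip_verify: true` is needed because the scheduler serves its metrics over HTTPS with a self-signed certificate in this local setup.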
