From d53df1c80066ec3df08b7e7ec7efc90a450b2e86 Mon Sep 17 00:00:00 2001 From: Alexandre Lavigne Date: Fri, 17 Oct 2025 17:51:36 +0200 Subject: [PATCH] CONTINT-4831 - add kubernetes control-plane monitoring dashboard Add a new dashboard to monitor the kubernetes control-plane. Signed-off-by: Alexandre Lavigne --- .../dashboards/kubernetes_control_plane.json | 3044 +++++++++++++++++ kubernetes/manifest.json | 5 +- 2 files changed, 3047 insertions(+), 2 deletions(-) create mode 100644 kubernetes/assets/dashboards/kubernetes_control_plane.json diff --git a/kubernetes/assets/dashboards/kubernetes_control_plane.json b/kubernetes/assets/dashboards/kubernetes_control_plane.json new file mode 100644 index 0000000000000..4186378867171 --- /dev/null +++ b/kubernetes/assets/dashboards/kubernetes_control_plane.json @@ -0,0 +1,3044 @@ +{ + "title": "Kubernetes Control Plane Overview", + "description": "Kubernetes Control-Plane monitoring\n\nThis dashboard shows different metrics/logs from kubernetes control-plane components (kube-api server, kube-scheduler, etcd).\n\nThis dashboard helps you monitor the kubernetes control-plane.\n\nIt requires the control-plane monitoring checks to be enabled using [this tutorial](https://docs.datadoghq.com/containers/kubernetes/control_plane/?tab=datadogoperator).\n\nFind more detailed monitoring for each component on their dedicated dashboards:\n\n- Kubernetes API server - [here](https://app.datadoghq.com/dash/integration/Kubernetes%20API%20Server%20-%20Overview)\n- Etcd - [here](https://app.datadoghq.com/dash/integration/Etcd%20Overview)\n- Kubernetes Scheduler - [here](https://app.datadoghq.com/dash/integration/kube_scheduler)\n- Kubernetes Controller manager - [here](https://app.datadoghq.com/dash/integration/kube_controller_manager)", + "widgets": [ + { + "id": 7973807627760301, + "definition": { + "title": "", + "banner_img": "/static/images/integration_dashboard/kubernetes_hero_2.jpeg", + "show_title": true, + "type": "group", + "layout_type": 
"ordered", + "widgets": [ + { + "id": 1396413511120281, + "definition": { + "type": "note", + "content": "## Kubernetes Control-Plane monitoring\n\nThis dashboard show different metrics/logs from kubernetes control-plane components (kube-api server, kube-scheduler, etcd).\n\nThis dashboard helps you monitor kubernetes control-plane.\n\nIt requires the control-plane monitoring checks to be enabled using [this tutorial](https://docs.datadoghq.com/containers/kubernetes/control_plane/?tab=datadogoperator).\n\nFind more detailed monitoring for each component on their dedicated dashboards:\n\n- Kubernetes API server - [here](https://app.datadoghq.com/dash/integration/Kubernetes%20API%20Server%20-%20Overview)\n- Etcd - [here](https://app.datadoghq.com/dash/integration/Etcd%20Overview)\n- Kubernetes Scheduler - [here](https://app.datadoghq.com/dash/integration/kube_scheduler)\n- Kubernetes Controller manager - [here](https://app.datadoghq.com/dash/integration/kube_controller_manager)", + "background_color": "white", + "font_size": "16", + "text_align": "left", + "vertical_align": "top", + "show_tick": false, + "tick_pos": "50%", + "tick_edge": "left", + "has_padding": true + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 2 + } + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 5 + } + }, + { + "id": 8197865697581777, + "definition": { + "title": "API Req/s", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 2910320272243625, + "definition": { + "title": "requests/s", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name} by 
{cluster_name}.as_rate().fill(null)" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.apiserver_request.count{$kube_cluster_name} by {cluster_name}.as_rate().fill(null)" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + }, + "formula": "query1" + }, + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + }, + "formula": "query2" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 1617033252778903, + "definition": { + "title": "5XX requests/s", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name,code:5*} by {cluster_name}.as_rate().fill(null)" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.apiserver_request.count{$kube_cluster_name,code:5*} by {cluster_name}.as_rate().fill(null)" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + }, + "formula": "query1" + }, + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + }, + "formula": "query2" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 3180365368978152, + "definition": { + "title": "4XX 
requests/s", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name,code:4*} by {cluster_name}.as_rate().fill(null)" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.apiserver_request.count{$kube_cluster_name,code:4*} by {cluster_name}.as_rate().fill(null)" + } + ], + "formulas": [ + { + "formula": "query1" + }, + { + "formula": "query2" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 6398578832430897, + "definition": { + "title": "429 requests/s", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name,code:429} by {cluster_name}.as_rate().fill(null)" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.apiserver_request.count{$kube_cluster_name,code:429} by {cluster_name}.as_rate().fill(null)" + } + ], + "formulas": [ + { + "formula": "query1" + }, + { + "formula": "query2" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 
1401319065517756, + "definition": { + "title": "API server average latency", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.request_duration_seconds.sum{$kube_cluster_name} by {verb}.fill(null)" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.request_duration_seconds.count{$kube_cluster_name} by {verb}.fill(null)" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:kube_apiserver.apiserver_request_duration_seconds.sum{$kube_cluster_name} by {verb}.fill(null)" + }, + { + "data_source": "metrics", + "name": "query4", + "query": "sum:kube_apiserver.apiserver_request_duration_seconds.count{$kube_cluster_name} by {verb}.fill(null)" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + }, + { + "formula": "query3 / query4" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 6, + "width": 4, + "height": 3 + } + }, + { + "id": 5271313341845281, + "definition": { + "title": "Etcd total DB storage size", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.etcd.db.total_size{$kube_cluster_name} by {cluster_name,storage_cluster_id}.fill(null)" + } + ], + "formulas": [ + { + "number_format": 
{ + "unit": { + "type": "canonical_unit", + "unit_name": "byte_in_decimal_bytes_family" + } + }, + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 6, + "width": 4, + "height": 3 + } + } + ] + }, + "layout": { + "x": 4, + "y": 0, + "width": 8, + "height": 10 + } + }, + { + "id": 7994791376832850, + "definition": { + "title": "AVG requests", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "formula": "query1 + query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total{$kube_cluster_name}", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name}.as_count()", + "aggregator": "avg" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 5, + "width": 4, + "height": 3 + } + }, + { + "id": 8259647833355372, + "definition": { + "title": "Average inflight requests", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "request" + } + }, + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.current_inflight_requests{$kube_cluster_name}", + "aggregator": "avg" + } + ], + "response_format": "scalar" + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "yaxis": { + "include_zero": true + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 8, + "width": 4, + "height": 2 + } 
+ }, + { + "id": 3478691643697987, + "definition": { + "title": "Kubernetes API server", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 8092086570660128, + "definition": { + "title": "Agent check success ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + }, + "unit_scale": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query2 / (query2 + query1 + query3)) * 100" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query2", + "query": "sum:check_run.kube_apiserver_controlplane.up.ok{$kube_cluster_name}.as_count()", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query1", + "query": "sum:check_run.kube_apiserver_controlplane.up.critical{$kube_cluster_name}.as_count()", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query3", + "query": "sum:check_run.kube_apiserver_controlplane.up.unknown{$kube_cluster_name}.as_count()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": "<", + "value": 95, + "palette": "white_on_red" + }, + { + "comparator": "<", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": ">=", + "value": 100, + "palette": "white_on_green" + } + ] + } + ], + "autoscale": false, + "text_align": "center", + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 4984084418648150, + "definition": { + "title": "Etcd API calls success ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + }, + "unit_scale": 
{ + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query2 / (query2 + query1)) * 100" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_apiserver.etcd_requests_total{$kube_cluster_name}.as_count()", + "aggregator": "last" + }, + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.etcd_request_errors_total{$kube_cluster_name}.as_count()", + "aggregator": "last" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": "<", + "value": 95, + "palette": "white_on_red" + }, + { + "comparator": "<", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": ">=", + "value": 100, + "palette": "white_on_green" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 2, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 3457000436558508, + "definition": { + "title": "API calls by resource", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total{$kube_cluster_name} by {resource}", + "aggregator": "sum" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 4, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 6656866360250061, + "definition": { + "title": "gRPC calls by code and service", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"sum:kube_apiserver.grpc_client_handled_total{$kube_cluster_name} by {grpc_code,grpc_service}.as_count()", + "aggregator": "sum" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 8, + "y": 0, + "width": 4, + "height": 3 + } + }, + { + "id": 8044026601258795, + "definition": { + "title": "HTTP requests By code", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name} by {code}.as_count()", + "aggregator": "sum" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 3, + "height": 3 + } + }, + { + "id": 3255384163441516, + "definition": { + "title": "HTTP requests by code per second", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.apiserver_request_total.count{$kube_cluster_name} by {code}.as_rate()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 3, + 
"width": 9, + "height": 3 + } + }, + { + "id": 2482207820702128, + "definition": { + "title": "HTTP requests by method", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.rest_client_requests_total.count{$kube_cluster_name} by {method}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 3, + "height": 3 + } + }, + { + "id": 3870576896721997, + "definition": { + "title": "HTTP requests by method per second", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_apiserver.rest_client_requests_total.count{$kube_cluster_name} by {method}.as_rate()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 3, + "y": 6, + "width": 9, + "height": 3 + } + }, + { + "id": 3259925679241169, + "definition": { + "title": "Logs per source", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "logs", + "search": { + "query": "kube_cluster_name:$kube_cluster_name.value AND source:(*kube* OR etcd OR coredns OR containerd)" + }, + "indexes": [ + "*" + ], + "group_by": [ + 
{ + "facet": "source", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc", + "metric": "count" + } + } + ], + "compute": { + "aggregation": "count" + }, + "storage": "hot" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 10, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 3, + "height": 3 + } + }, + { + "id": 6960234408971312, + "definition": { + "title": "Top 10 log sources", + "title_size": "16", + "title_align": "left", + "type": "toplist", + "requests": [ + { + "queries": [ + { + "name": "query1", + "data_source": "logs", + "search": { + "query": "kube_cluster_name:$kube_cluster_name.value AND source:(*kube* OR etcd OR coredns OR containerd)" + }, + "indexes": [ + "*" + ], + "group_by": [ + { + "facet": "source", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc", + "metric": "count" + }, + "should_exclude_missing": true + } + ], + "compute": { + "aggregation": "count" + }, + "storage": "hot" + } + ], + "response_format": "scalar", + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "entry" + } + }, + "formula": "query1" + } + ], + "sort": { + "count": 10, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "style": { + "display": { + "type": "stacked", + "legend": "automatic" + }, + "scaling": "relative" + } + }, + "layout": { + "x": 3, + "y": 9, + "width": 4, + "height": 3 + } + }, + { + "id": 5088546305401882, + "definition": { + "title": "Top 10 log sources - per second", + "title_size": "16", + "title_align": "left", + "type": "toplist", + "requests": [ + { + "queries": [ + { + "name": "query1", + "data_source": "logs", + "search": { + "query": "kube_cluster_name:$kube_cluster_name.value AND 
source:(*kube* OR etcd OR coredns OR containerd)" + }, + "indexes": [ + "*" + ], + "group_by": [ + { + "facet": "source", + "limit": 10, + "sort": { + "aggregation": "count", + "order": "desc", + "metric": "count" + } + } + ], + "compute": { + "aggregation": "count" + }, + "storage": "hot" + } + ], + "response_format": "scalar", + "formulas": [ + { + "formula": "throughput(query1)" + } + ], + "sort": { + "count": 10, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "style": { + "display": { + "type": "stacked", + "legend": "automatic" + } + } + }, + "layout": { + "x": 7, + "y": 9, + "width": 5, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 10, + "width": 12, + "height": 13 + } + }, + { + "id": 4532967580563887, + "definition": { + "title": "Etcd", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 1639655680757989, + "definition": { + "title": "Agent check success ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + }, + "unit_scale": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query1 / (query1 + query2)) * 100" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:check_run.etcd.prometheus.health.ok{$kube_cluster_name}.as_count()", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:check_run.etcd.prometheus.health.critical{$kube_cluster_name}.as_count()", + "aggregator": "avg" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": "<", + "value": 95, + "palette": "white_on_red" + }, + { + "comparator": "<", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": ">=", + "value": 100, + "palette": "white_on_green" + } + ] + } + ], + 
"autoscale": false, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 6400321761844537, + "definition": { + "title": "Etcd average commit duration", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:etcd.disk.backend.commit.duration.seconds.sum{$kube_cluster_name}", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "avg:etcd.disk.backend.commit.duration.seconds.count{$kube_cluster_name}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + } + ] + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 2, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 3154306242916084, + "definition": { + "title": "Msg sent by gRPC service", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.grpc.server.msg.sent.total{$kube_cluster_name} by {grpc_service}.as_count()", + "aggregator": "sum" + } + ], + "response_format": "scalar", + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 4, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "id": 2281078232667209, + "definition": { + "title": "Etcd db compaction duration/pause average time", + "title_size": 
"16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "millisecond" + } + }, + "alias": "duration", + "formula": "query1 / query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.debugging.mvcc.db.compaction.total.duration.milliseconds.sum{$kube_cluster_name}" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:etcd.debugging.mvcc.db.compaction.total.duration.milliseconds.count{$kube_cluster_name}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "sum:etcd.debugging.mvcc.db.compaction.pause.duration.milliseconds.sum{$kube_cluster_name}" + }, + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.debugging.mvcc.db.compaction.pause.duration.milliseconds.count{$kube_cluster_name}" + } + ], + "formulas": [ + { + "alias": "pause", + "formula": "query0 / query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "millisecond" + } + } + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 7, + "y": 0, + "width": 5, + "height": 3 + } + }, + { + "id": 7769591245715321, + "definition": { + "title": "Etcd total number of keys.", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"avg:etcd.debugging.mvcc.keys.total{$kube_cluster_name}", + "aggregator": "last" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 2, + "height": 3 + } + }, + { + "id": 3957160668151665, + "definition": { + "title": "Etcd compacted keys", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.debugging.mvcc.db.compaction.keys.total{$kube_cluster_name}.as_count()", + "aggregator": "last" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 2, + "y": 3, + "width": 2, + "height": 3 + } + }, + { + "id": 1595490164455588, + "definition": { + "title": "Msg received by gRPC service", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.grpc.server.msg.received.total{$kube_cluster_name} by {grpc_service}.as_count()", + "aggregator": "avg" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 4, + "y": 3, + "width": 3, + "height": 3 + } + }, + { + "id": 6405943740320479, + "definition": { + "title": "Etcd store writes/reads", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + 
"value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "alias": "writes", + "formula": "query1" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.debugging.store.writes.total{$kube_cluster_name}.as_count()" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "order_reverse": false, + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "area" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "sum:etcd.debugging.store.reads.total{$kube_cluster_name}.as_count()" + } + ], + "formulas": [ + { + "alias": "reads", + "formula": "query0" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "area" + } + ] + }, + "layout": { + "x": 7, + "y": 3, + "width": 5, + "height": 3 + } + }, + { + "id": 7037325482874671, + "definition": { + "title": "Total number of bytes in WAL", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:etcd.disk.wal.write.bytes.total{$kube_cluster_name}", + "aggregator": "last" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 2, + "height": 3 + } + }, + { + "id": 3341073619319436, + "definition": { + "title": "Etcd available fd", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "100 - ((query2 / query1) * 100)" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query2", + 
"query": "sum:etcd.process.open.fds{$kube_cluster_name}", + "aggregator": "last" + }, + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.process.max.fds{$kube_cluster_name}", + "aggregator": "last" + } + ], + "response_format": "scalar", + "conditional_formats": [ + { + "comparator": ">", + "value": 50, + "palette": "white_on_green" + }, + { + "comparator": "<=", + "value": 50, + "palette": "white_on_yellow" + }, + { + "comparator": "<=", + "value": 10, + "palette": "white_on_red" + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 2, + "y": 6, + "width": 2, + "height": 3 + } + }, + { + "id": 74915216088589, + "definition": { + "title": "DB Storage size", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:kube_apiserver.etcd.db.total_size{$kube_cluster_name}", + "aggregator": "avg" + } + ], + "formulas": [ + { + "formula": "query1", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "byte_in_binary_bytes_family" + } + } + } + ] + } + ], + "autoscale": true, + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 4, + "y": 6, + "width": 3, + "height": 3 + } + }, + { + "id": 1295083390433235, + "definition": { + "title": "Etcd operations type/sec", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:etcd.debugging.mvcc.put.total{$kube_cluster_name}.as_rate()" + } + ], + "formulas": [ + { + 
"alias": "put", + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "sum:etcd.debugging.mvcc.range.total{$kube_cluster_name}.as_rate()" + } + ], + "formulas": [ + { + "alias": "range", + "formula": "query0" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "sum:etcd.debugging.mvcc.events.total{$kube_cluster_name}.as_rate()" + } + ], + "formulas": [ + { + "alias": "events", + "formula": "query0" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + }, + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query0", + "query": "sum:etcd.debugging.mvcc.delete.total{$kube_cluster_name}.as_rate()" + } + ], + "formulas": [ + { + "alias": "delete", + "formula": "query0" + } + ], + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 7, + "y": 6, + "width": 5, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 23, + "width": 12, + "height": 10 + } + }, + { + "id": 7727946338119382, + "definition": { + "title": "Kubernetes scheduler", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 3489560228035212, + "definition": { + "title": "Agent check success ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": 
"sum:check_run.kube_scheduler.up.ok{$kube_cluster_name}.as_count()", + "aggregator": "last" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:check_run.kube_scheduler.up.critical{$kube_cluster_name}.as_count()", + "aggregator": "last" + } + ], + "conditional_formats": [ + { + "comparator": "<", + "value": 95, + "palette": "white_on_red" + }, + { + "comparator": "<", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": ">=", + "value": 100, + "palette": "white_on_green" + } + ], + "formulas": [ + { + "formula": "query1 / (query1 + query2) * 100", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + }, + "unit_scale": { + "type": "canonical_unit", + "unit_name": "percent" + } + } + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 4975111239656013, + "definition": { + "title": "Pending pods ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.pending_pods{$kube_cluster_name}", + "aggregator": "last" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kubernetes.pods.running{$kube_cluster_name}", + "aggregator": "last" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 1, + "palette": "white_on_red" + }, + { + "comparator": "<=", + "value": 1, + "palette": "white_on_green" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + } + }, + "formula": "(query1 / (query1 + query2)) * 100" + } + ] + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + 
"layout": { + "x": 2, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 8186622066857433, + "definition": { + "title": "Events by Queue and Type", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.queue.incoming_pods{$kube_cluster_name} by {queue,event}.as_count()", + "aggregator": "last" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 4, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 8129269513646259, + "definition": { + "title": "Average pod scheduling duration by number of attempts", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + } + ], + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.scheduling.pod.scheduling_duration.sum{$kube_cluster_name} by {attempts}" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_scheduler.scheduling.pod.scheduling_duration.count{$kube_cluster_name} by {attempts}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 3418343648746570, + "definition": { + "title": "Average scheduling attempt by result", 
+ "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.schedule_attempts{$kube_cluster_name} by {result}.as_count()", + "aggregator": "avg" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 5, + "height": 3 + } + }, + { + "id": 1112928655100049, + "definition": { + "title": "Pod schedule attempt by result", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.schedule_attempts{$kube_cluster_name} by {result}.as_count()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 5, + "y": 3, + "width": 7, + "height": 3 + } + }, + { + "id": 5619828820983304, + "definition": { + "title": "Average HTTP client requests by code", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.client.http.requests{$kube_cluster_name} by {code}.as_count()", + "aggregator": "avg" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 
0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 5, + "height": 3 + } + }, + { + "id": 5097720695323731, + "definition": { + "title": "HTTP client requests / sec by code", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.client.http.requests{$kube_cluster_name} by {code}.as_rate()" + } + ], + "formulas": [ + { + "formula": "query1", + "number_format": {} + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 5, + "y": 6, + "width": 7, + "height": 3 + } + }, + { + "id": 6992991848491610, + "definition": { + "title": "Pending pods by queue", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.pending_pods{$kube_cluster_name} by {queue}", + "aggregator": "last" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 9, + "width": 5, + "height": 3 + } + }, + { + "id": 3785182674181022, + "definition": { + "title": "Pending pods by queue", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", +
"requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_scheduler.pending_pods{$kube_cluster_name} by {queue}" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 5, + "y": 9, + "width": 7, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 33, + "width": 12, + "height": 13 + } + }, + { + "id": 1736570212005350, + "definition": { + "title": "Kube controller manager", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 8853295397091082, + "definition": { + "title": "Agent check run success ratio", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:check_run.kube_controller_manager.up.ok{$kube_cluster_name}.as_count()", + "aggregator": "last" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:check_run.kube_controller_manager.up.critical{$kube_cluster_name}.as_count()", + "aggregator": "last" + } + ], + "conditional_formats": [ + { + "comparator": "<", + "value": 95, + "palette": "white_on_red" + }, + { + "comparator": ">", + "value": 100, + "palette": "white_on_yellow" + }, + { + "comparator": ">=", + "value": 100, + "palette": "white_on_green" + } + ], + "formulas": [ + { + "formula": "(query1 / (query1 + query2)) * 100", + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "percent" + }, + "unit_scale": { + "type": "canonical_unit", + "unit_name": "percent" + } + } + } + ] + } + ], + "autoscale": false, + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 0, + "y": 
0, + "width": 2, + "height": 3 + } + }, + { + "id": 4033566830934375, + "definition": { + "title": "Node evictions", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.nodes.evictions{$kube_cluster_name}.as_count()", + "aggregator": "avg" + } + ], + "conditional_formats": [ + { + "comparator": "=", + "value": 0, + "palette": "white_on_green" + }, + { + "comparator": ">", + "value": 0, + "palette": "white_on_yellow" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "yaxis": { + "include_zero": false + }, + "type": "area" + } + }, + "layout": { + "x": 2, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 7350980922272526, + "definition": { + "title": "Nodes unhealthy", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "avg:kube_controller_manager.nodes.unhealthy{$kube_cluster_name}", + "aggregator": "last" + } + ], + "conditional_formats": [ + { + "comparator": "=", + "value": 0, + "palette": "white_on_green" + }, + { + "comparator": ">", + "value": 0, + "palette": "white_on_yellow" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "precision": 0, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": false + } + } + }, + "layout": { + "x": 4, + "y": 0, + "width": 2, + "height": 3 + } + }, + { + "id": 2933948789213781, + "definition": { + "title": "Queue additions by queue type", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query":
"sum:kube_controller_manager.queue.adds{$kube_cluster_name} by {queue}.as_rate()", + "aggregator": "last" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "id": 5748106069798505, + "definition": { + "title": "Queue depth by queue type", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.queue.depth{$kube_cluster_name} by {queue}", + "aggregator": "last" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 9, + "y": 0, + "width": 3, + "height": 3 + } + }, + { + "id": 1200607541128559, + "definition": { + "title": "Average HTTP client requests by code", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.client.http.requests{$kube_cluster_name} by {code}.as_count()", + "aggregator": "avg" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "formula": "query1" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 3, + "width": 4, + "height": 3 + } + }, + { + "id": 3861210944853009, + "definition": { + 
"title": "Client HTTP requests by code", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.client.http.requests{$kube_cluster_name} by {code}.as_count()" + } + ], + "formulas": [ + { + "formula": "query1" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 3, + "width": 8, + "height": 3 + } + }, + { + "id": 4477685684369038, + "definition": { + "title": "Average queue process duration by queue type", + "title_size": "16", + "title_align": "left", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.queue.queue_duration.sum{$kube_cluster_name} by {queue}", + "aggregator": "avg" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_controller_manager.queue.queue_duration.count{$kube_cluster_name} by {queue}", + "aggregator": "avg" + } + ], + "style": { + "palette": "datadog16" + }, + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + } + ], + "sort": { + "count": 500, + "order_by": [ + { + "type": "formula", + "index": 0, + "order": "desc" + } + ] + } + } + ], + "type": "sunburst", + "legend": { + "type": "automatic" + } + }, + "layout": { + "x": 0, + "y": 6, + "width": 4, + "height": 3 + } + }, + { + "id": 384164155943797, + "definition": { + "title": "Average queue duration by queue", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + 
"avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "response_format": "timeseries", + "queries": [ + { + "data_source": "metrics", + "name": "query1", + "query": "sum:kube_controller_manager.queue.queue_duration.sum{$kube_cluster_name} by {queue}" + }, + { + "data_source": "metrics", + "name": "query2", + "query": "sum:kube_controller_manager.queue.queue_duration.count{$kube_cluster_name} by {queue}" + } + ], + "formulas": [ + { + "number_format": { + "unit": { + "type": "canonical_unit", + "unit_name": "second" + } + }, + "formula": "query1 / query2" + } + ], + "style": { + "palette": "dog_classic", + "order_by": "values", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 4, + "y": 6, + "width": 8, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 46, + "width": 12, + "height": 10 + } + } + ], + "template_variables": [ + { + "name": "kube_cluster_name", + "prefix": "kube_cluster_name", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} diff --git a/kubernetes/manifest.json b/kubernetes/manifest.json index 66f52c9d1ae75..d3e88f30e973f 100644 --- a/kubernetes/manifest.json +++ b/kubernetes/manifest.json @@ -64,7 +64,8 @@ "Kubernetes Jobs & Cronjobs Overview": "assets/dashboards/kubernetes_jobs.json", "Kubernetes StatefulSets Overview": "assets/dashboards/kubernetes_statefulsets.json", "Kubernetes Clusters Overview": "assets/dashboards/kubernetes_clusters.json", - "Kubernetes CPU & Memory usage": "assets/dashboards/kubernetes_cpu_memory.json" + "Kubernetes CPU & Memory Usage": "assets/dashboards/kubernetes_cpu_memory.json", + "Kubernetes Control Plane Overview": "assets/dashboards/kubernetes_control_plane.json" }, "monitors": { "Kubernetes Deployment Replicas are failing": "assets/monitors/monitor_deployments_replicas.json", @@ -83,4 +84,4 @@ "unbound_volumes": 
"assets/saved_views/unbound_volumes.json" } } -} \ No newline at end of file +}