From d8f9e8fbd1e64d4381abc784ca861ceff350d81a Mon Sep 17 00:00:00 2001 From: "Kim Hyunyoung, Abel" Date: Fri, 5 Apr 2024 18:19:45 +0900 Subject: [PATCH] =?UTF-8?q?feat(application-template):=20PrometheusRule=20?= =?UTF-8?q?CR=20=EA=B8=B0=EB=8A=A5=20=EC=B6=94=EA=B0=80=ED=95=98=EA=B8=B0?= =?UTF-8?q?=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix: render service monitor CR only the CRD installed * feat: add prometheus rule CR * feat: change rules to alerting_roles * feat: worker, scheduler에 prometheus rule 추가 * feat: trigger actions * feat: update chart version * feat: update readme --- charts/application-template/Chart.yaml | 2 +- charts/application-template/README.md | 12 ++- .../templates/scheduler/prometheus-rule.yaml | 81 +++++++++++++++++++ .../templates/server/prometheus-rule.yaml | 81 +++++++++++++++++++ .../templates/service_monitor.yaml | 2 +- .../templates/worker/prometheus-rule.yaml | 81 +++++++++++++++++++ charts/application-template/values.yaml | 41 ++++++++++ 7 files changed, 295 insertions(+), 5 deletions(-) create mode 100644 charts/application-template/templates/scheduler/prometheus-rule.yaml create mode 100644 charts/application-template/templates/server/prometheus-rule.yaml create mode 100644 charts/application-template/templates/worker/prometheus-rule.yaml diff --git a/charts/application-template/Chart.yaml b/charts/application-template/Chart.yaml index 11cd07a..6e8b411 100644 --- a/charts/application-template/Chart.yaml +++ b/charts/application-template/Chart.yaml @@ -9,4 +9,4 @@ maintainers: - name: modusign url: https://github.com/modusign name: application-template -version: 1.4.2 +version: 1.5.0 diff --git a/charts/application-template/README.md b/charts/application-template/README.md index b2c3e4a..dfa18a6 100644 --- a/charts/application-template/README.md +++ b/charts/application-template/README.md @@ -1,6 +1,6 @@ # application-template -![Version: 
1.3.2](https://img.shields.io/badge/Version-1.3.2-informational?style=flat-square) ![AppVersion: v1.0.0](https://img.shields.io/badge/AppVersion-v1.0.0-informational?style=flat-square) +![Version: 1.5.0](https://img.shields.io/badge/Version-1.5.0-informational?style=flat-square) ![AppVersion: v1.0.0](https://img.shields.io/badge/AppVersion-v1.0.0-informational?style=flat-square) A Helm chart for Modusign Applications @@ -33,6 +33,8 @@ Kubernetes: `>=1.23` | global.minReadySeconds | int | `60` | optional field that specifies the minimum number of seconds for which a newly created Pod should be ready without any of its containers crashing | | global.nodeSelector | object | `{}` | Default node selector for all components | | global.observability.datadog | object | `{"admissionController":{"enabled":false}}` | inject datadog admission controller env label | +| global.observability.prometheus | object | `{"serviceMonitor":{"enabled":false,"path":"/metrics","portName":"metrics"}}` | set up additional service port and setup | +| global.observability.prometheus.serviceMonitor | object | `{"enabled":false,"path":"/metrics","portName":"metrics"}` | create Prometheus Operator ServiceMonitor CR | | global.podAnnotations | object | `{}` | Annotations for the all deployed pods | | global.podLabels | object | `{}` | Labels for the all deployed pods | | global.revisionHistoryLimit | int | `3` | Number of old deployment ReplicaSets to retain. The rest will be garbage collected. 
| @@ -73,6 +75,8 @@ Kubernetes: `>=1.23` | scheduler.istio.virtualServices | list | `[]` | virtualService configuration | | scheduler.lifecycle | object | `{}` | Specify postStart and preStop lifecycle hooks for your container | | scheduler.nodeSelector | object | `{}` (defaults to global.nodeSelector) | [Node selector] | +| scheduler.observability.prometheus.alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for service container | +| scheduler.observability.prometheus.istio_alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for istio proxy container | | scheduler.pdb.annotations | object | `{}` | Annotations to be added to scheduler pdb | | scheduler.pdb.enabled | bool | `false` | Deploy a [PodDisruptionBudget] for the scheduler | | scheduler.pdb.labels | object | `{}` | Labels to be added to scheduler pdb | @@ -128,6 +132,8 @@ Kubernetes: `>=1.23` | server.istio.virtualServices | list | `[]` | virtualService configuration | | server.lifecycle | object | `{}` | Specify postStart and preStop lifecycle hooks for your container | | server.nodeSelector | object | `{}` (defaults to global.nodeSelector) | [Node selector] | +| server.observability.prometheus.alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for service container | +| server.observability.prometheus.istio_alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for istio proxy container | | server.pdb.annotations | object | `{}` | Annotations to be added to server pdb | | server.pdb.enabled | bool | `true` | Deploy a [PodDisruptionBudget] for the server | | server.pdb.labels | object | `{}` | Labels to be added to server 
pdb | @@ -181,6 +187,8 @@ Kubernetes: `>=1.23` | worker.istio.virtualServices | list | `[]` | virtualService configuration | | worker.lifecycle | object | `{}` | Specify postStart and preStop lifecycle hooks for your container | | worker.nodeSelector | object | `{}` (defaults to global.nodeSelector) | [Node selector] | +| worker.observability.prometheus.alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for service container | +| worker.observability.prometheus.istio_alerting_rules | object | `{"enabled":false,"highCpuUsageThreshold":70,"highMemoryUsageThreshold":70}` | create Prometheus Operator PrometheusRule CR for istio proxy container | | worker.pdb.annotations | object | `{}` | Annotations to be added to worker pdb | | worker.pdb.enabled | bool | `false` | Deploy a [PodDisruptionBudget] for the worker | | worker.pdb.labels | object | `{}` | Labels to be added to worker pdb | @@ -207,5 +215,3 @@ Kubernetes: `>=1.23` | worker.volumes | list | `[]` | Additional volumes to the application worker pod | | worker.workload | string | `"deployment"` | set deployment kind to Rollouts rollout: enabled : false | ----------------------------------------------- -Autogenerated from chart metadata using [helm-docs v1.11.3](https://github.com/norwoodj/helm-docs/releases/v1.11.3) diff --git a/charts/application-template/templates/scheduler/prometheus-rule.yaml b/charts/application-template/templates/scheduler/prometheus-rule.yaml new file mode 100644 index 0000000..40b577b --- /dev/null +++ b/charts/application-template/templates/scheduler/prometheus-rule.yaml @@ -0,0 +1,81 @@ +{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }} +{{- if and .Values.scheduler.enabled (or .Values.scheduler.observability.prometheus.alerting_rules.enabled .Values.scheduler.observability.prometheus.istio_alerting_rules.enabled) }} +apiVersion: monitoring.coreos.com/v1 
+kind: PrometheusRule +metadata: + name: {{ template "application.scheduler.name" . }} + namespace: {{ .Release.Namespace }} +spec: + groups: + {{- if .Values.scheduler.observability.prometheus.alerting_rules.enabled }} + - name: ServiceContainerResourceUsage + rules: + - alert: "HighServiceContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container={{ .Values.scheduler.name | quote }} }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.scheduler.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container={{ .Values.scheduler.name | quote }} }) + ) + * 100 + > {{ .Values.scheduler.observability.prometheus.alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.scheduler.name" . | title }}] High CPU usage" + description: "[{{ include "application.scheduler.name" . | title }}] 서비스의 최근 CPU 사용량이 {{ .Values.scheduler.observability.prometheus.alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighServiceContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container={{ .Values.scheduler.name | quote }} } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.scheduler.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container={{ .Values.scheduler.name | quote }} }) + ) * 100 + > {{ .Values.scheduler.observability.prometheus.alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.scheduler.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.scheduler.name" . | title }}] High memory usage" + description: "[{{ include "application.scheduler.name" . 
| title }}] 서비스의 최근 메모리 사용량이 {{ .Values.scheduler.observability.prometheus.alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} + {{- if .Values.scheduler.observability.prometheus.istio_alerting_rules.enabled }} + - name: IstioContainerResourceUsage + rules: + - alert: "HighIstioContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container="istio-proxy" }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.scheduler.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container="istio-proxy" }) + ) + * 100 + > {{ .Values.scheduler.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.scheduler.name" . | title }}][istio-proxy] High CPU usage" + description: "[{{ include "application.scheduler.name" . | title }}][istio-proxy] 서비스의 최근 CPU 사용량이 {{ .Values.scheduler.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighIstioContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container="istio-proxy" } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.scheduler.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container="istio-proxy" }) + ) * 100 + > {{ .Values.scheduler.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.scheduler.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.scheduler.name" . | title }}][istio-proxy] High memory usage" + description: "[{{ include "application.scheduler.name" . 
| title }}][istio-proxy] 서비스의 최근 메모리 사용량이 {{ .Values.scheduler.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} +{{- end }} +{{- end }} diff --git a/charts/application-template/templates/server/prometheus-rule.yaml b/charts/application-template/templates/server/prometheus-rule.yaml new file mode 100644 index 0000000..2f12afa --- /dev/null +++ b/charts/application-template/templates/server/prometheus-rule.yaml @@ -0,0 +1,81 @@ +{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }} +{{- if and .Values.server.enabled (or .Values.server.observability.prometheus.alerting_rules.enabled .Values.server.observability.prometheus.istio_alerting_rules.enabled) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ template "application.server.name" . }} + namespace: {{ .Release.Namespace }} +spec: + groups: + {{- if .Values.server.observability.prometheus.alerting_rules.enabled }} + - name: ServiceContainerResourceUsage + rules: + - alert: "HighServiceContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container={{ .Values.server.name | quote }} }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.server.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container={{ .Values.server.name | quote }} }) + ) + * 100 + > {{ .Values.server.observability.prometheus.alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.server.name" . | title }}] High CPU usage" + description: "[{{ include "application.server.name" . | title }}] 서비스의 최근 CPU 사용량이 {{ .Values.server.observability.prometheus.alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 
현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighServiceContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container={{ .Values.server.name | quote }} } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.server.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container={{ .Values.server.name | quote }} }) + ) * 100 + > {{ .Values.server.observability.prometheus.alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.server.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.server.name" . | title }}] High memory usage" + description: "[{{ include "application.server.name" . | title }}] 서비스의 최근 메모리 사용량이 {{ .Values.server.observability.prometheus.alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} + {{- if .Values.server.observability.prometheus.istio_alerting_rules.enabled }} + - name: IstioContainerResourceUsage + rules: + - alert: "HighIstioContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container="istio-proxy" }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.server.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container="istio-proxy" }) + ) + * 100 + > {{ .Values.server.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.server.name" . | title }}][istio-proxy] High CPU usage" + description: "[{{ include "application.server.name" . | title }}][istio-proxy] 서비스의 최근 CPU 사용량이 {{ .Values.server.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 
현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighIstioContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container="istio-proxy" } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.server.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container="istio-proxy" }) + ) * 100 + > {{ .Values.server.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.server.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.server.name" . | title }}][istio-proxy] High memory usage" + description: "[{{ include "application.server.name" . | title }}][istio-proxy] 서비스의 최근 메모리 사용량이 {{ .Values.server.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} +{{- end }} +{{- end }} diff --git a/charts/application-template/templates/service_monitor.yaml b/charts/application-template/templates/service_monitor.yaml index 8d933fd..abad42c 100644 --- a/charts/application-template/templates/service_monitor.yaml +++ b/charts/application-template/templates/service_monitor.yaml @@ -1,4 +1,4 @@ -{{- if .Values.global.observability.prometheus.serviceMonitor.enabled }} +{{- if and .Values.global.observability.prometheus.serviceMonitor.enabled (.Capabilities.APIVersions.Has "monitoring.coreos.com/v1/ServiceMonitor") }} apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: diff --git a/charts/application-template/templates/worker/prometheus-rule.yaml b/charts/application-template/templates/worker/prometheus-rule.yaml new file mode 100644 index 0000000..0fd7e79 --- /dev/null +++ b/charts/application-template/templates/worker/prometheus-rule.yaml @@ -0,0 +1,81 @@ +{{- if .Capabilities.APIVersions.Has "monitoring.coreos.com/v1/PrometheusRule" }} +{{- if and .Values.worker.enabled (or 
.Values.worker.observability.prometheus.alerting_rules.enabled .Values.worker.observability.prometheus.istio_alerting_rules.enabled) }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ template "application.worker.name" . }} + namespace: {{ .Release.Namespace }} +spec: + groups: + {{- if .Values.worker.observability.prometheus.alerting_rules.enabled }} + - name: ServiceContainerResourceUsage + rules: + - alert: "HighServiceContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container={{ .Values.worker.name | quote }} }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.worker.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container={{ .Values.worker.name | quote }} }) + ) + * 100 + > {{ .Values.worker.observability.prometheus.alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.worker.name" . | title }}] High CPU usage" + description: "[{{ include "application.worker.name" . | title }}] 서비스의 최근 CPU 사용량이 {{ .Values.worker.observability.prometheus.alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighServiceContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container={{ .Values.worker.name | quote }} } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.worker.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container={{ .Values.worker.name | quote }} }) + ) * 100 + > {{ .Values.worker.observability.prometheus.alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.worker.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.worker.name" . 
| title }}] High memory usage" + description: "[{{ include "application.worker.name" . | title }}] 서비스의 최근 메모리 사용량이 {{ .Values.worker.observability.prometheus.alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} + {{- if .Values.worker.observability.prometheus.istio_alerting_rules.enabled }} + - name: IstioContainerResourceUsage + rules: + - alert: "HighIstioContainerCPUUsage" + expr: | + avg( + rate(container_cpu_usage_seconds_total{ container="istio-proxy" }[2m]) * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.worker.name" . | quote }} } + / on(pod) + (kube_pod_container_resource_limits{ resource="cpu", container="istio-proxy" }) + ) + * 100 + > {{ .Values.worker.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }} + for: 5m + labels: + severity: critical + annotations: + summary: "[{{ include "application.worker.name" . | title }}][istio-proxy] High CPU usage" + description: "[{{ include "application.worker.name" . | title }}][istio-proxy] 서비스의 최근 CPU 사용량이 {{ .Values.worker.observability.prometheus.istio_alerting_rules.highCpuUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + + - alert: HighIstioContainerMemoryUsage + expr: | + avg( + (container_memory_rss{ container="istio-proxy" } * on(pod) group_left kube_pod_labels{ label_app_kubernetes_io_name={{ include "application.worker.name" . | quote }} }) + / on(pod) + (kube_pod_container_resource_limits{ resource="memory", container="istio-proxy" }) + ) * 100 + > {{ .Values.worker.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }} + for: 5m + labels: + service: {{ include "application.worker.name" . | quote }} + severity: critical + annotations: + summary: "[{{ include "application.worker.name" . | title }}][istio-proxy] High memory usage" + description: "[{{ include "application.worker.name" . 
| title }}][istio-proxy] 서비스의 최근 메모리 사용량이 {{ .Values.worker.observability.prometheus.istio_alerting_rules.highMemoryUsageThreshold }}% 이상이 되었습니다. 현재값: {{`{{ .Value | humanize }}`}}%" + {{- end }} +{{- end }} +{{- end }} diff --git a/charts/application-template/values.yaml b/charts/application-template/values.yaml index e5ae4a9..78c288c 100644 --- a/charts/application-template/values.yaml +++ b/charts/application-template/values.yaml @@ -354,6 +354,19 @@ server: # -- destinationRule configuration destinationRules: [] + observability: + prometheus: + # -- create Prometheus Operator PrometheusRule CR for service container + alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 + # -- create Prometheus Operator PrometheusRule CR for istio proxy container + istio_alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 + ## Worker worker: enabled: false @@ -598,6 +611,20 @@ worker: # -- destinationRule configuration destinationRules: [] + observability: + prometheus: + # -- create Prometheus Operator PrometheusRule CR for service container + alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 + # -- create Prometheus Operator PrometheusRule CR for istio proxy container + istio_alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 + + ## Scheduler scheduler: enabled: false @@ -844,3 +871,17 @@ scheduler: virtualServices: [] # -- destinationRule configuration destinationRules: [] + + observability: + prometheus: + # -- create Prometheus Operator PrometheusRule CR for service container + alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 + # -- create Prometheus Operator PrometheusRule CR for istio proxy container + istio_alerting_rules: + enabled: false + highCpuUsageThreshold: 70 + highMemoryUsageThreshold: 70 +