
Commit fc83e6a

Add postgres-exporter alerts

1 parent 47d5d05 commit fc83e6a

File tree

  postgres/system-alerts.yaml.tmpl
  postgres/team-alerts.yaml.tmpl

2 files changed: +169 -0 lines changed

postgres/system-alerts.yaml.tmpl

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# PROMETHEUS RULES
# DO NOT REMOVE line above, used in `pre-commit` hook

groups:
  - name: postgres-exporter
    rules:
      - alert: "PostgresExporterDown"
        expr: |
          up{job="postgres-exporter"} == 0
        for: 15m
        labels:
          team: infra
        annotations:
          summary: "Postgres Exporter is down"
          impact: "Postgres instances are not monitored"
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/SQLExporterDown
          action: |
            Check if postgres-exporter is running in sys-prom namespace.
            Check the logs.
            Restart pods.

      - alert: "PostgresExporterScrapingLimit"
        expr: |
          avg_over_time(pg_exporter_last_scrape_duration_seconds{job="postgres-exporter", instance!=""}[10m]) > 30
        for: 5m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "Postgres Exporter scraping is taking a long time"
          impact: "Postgres instances are not monitored."
          runbook_url: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/SQLExporterScrapingLimit
          action: |
            Check postgres-exporter logs and resource usage.
            Check postgres database for any long-running queries.
            Restart pods.

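The rules above can be exercised before rollout with Prometheus rule unit tests. Below is a minimal sketch of a `promtool test rules` file for the PostgresExporterDown alert; the test file path, the rendered rule-file path, and the sample instance label are assumptions for illustration and are not part of this commit.

  # tests/system-alerts_test.yaml (hypothetical path)
  # Run with: promtool test rules tests/system-alerts_test.yaml
  rule_files:
    - ../postgres/system-alerts.yaml   # rendered template; the path is an assumption
  evaluation_interval: 1m

  tests:
    - interval: 1m
      input_series:
        # The exporter answers twice, then disappears: `up` drops to 0 and stays there.
        - series: 'up{job="postgres-exporter", instance="postgres-exporter:9187"}'
          values: '1 1 0x30'
      alert_rule_test:
        - eval_time: 20m
          alertname: PostgresExporterDown
          exp_alerts:
            - exp_labels:
                job: postgres-exporter
                instance: postgres-exporter:9187
                team: infra
              exp_annotations:
                summary: "Postgres Exporter is down"
                impact: "Postgres instances are not monitored"
                qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/SQLExporterDown
                action: |
                  Check if postgres-exporter is running in sys-prom namespace.
                  Check the logs.
                  Restart pods.

At eval_time 20m the expression has been true for more than the 15m `for` window, so the test expects the alert to be firing with the exporter's series labels plus the rule's `team: infra` label.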
postgres/team-alerts.yaml.tmpl

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
# PROMETHEUS RULES
# DO NOT REMOVE line above, used in `pre-commit` hook

groups:
  # Uses `uw_rds_owner_team` recording rule created in `system-alerts/rds` for team detection.
  # Based on https://github.com/qonto/database-monitoring-framework/blob/main/charts/prometheus-postgresql-alerts/values.yaml
  - name: Postgres
    rules:
      - alert: "PostgreSQLInactiveLogicalReplicationSlot"
        expr: |
          max by (target, slot_name) (pg_replication_slots_active{slot_type="logical"}) < 1
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 10m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "Logical replication slot {{ $labels.slot_name }} on {{ $labels.target }} is inactive"
          impact: "Potential disk space saturation and replication slot no longer being usable."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLInactiveLogicalReplicationSlot
          action: |
            Check the replication slot disk space consumption trend.
            Identify the non-running logical replication slot.
            Investigate and fix the replication slot client.

      - alert: "PostgreSQLInactivePhysicalReplicationSlot"
        expr: |
          max by (target, slot_name) (pg_replication_slots_active{slot_type="physical"}) < 1
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 10m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "Physical replication slot {{ $labels.slot_name }} on {{ $labels.target }} is inactive"
          impact: "Potential disk space saturation and replication slot no longer being usable."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLInactivePhysicalReplicationSlot
          action: |
            Check the replication slot disk space consumption trend.
            Check replica lag and instance logs.
            Increase disk space on the primary instance if necessary.

      - alert: "PostgreSQLInvalidIndex"
        # pint disable promql/series
        expr: |
          count by (datname, relname, indexrelname, server) (
            pg_stat_user_indexes_idx_scan{indisvalid="false"}
          ) > 0
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 1h
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary:
            "Index {{ $labels.indexrelname }} of {{ $labels.relname }} table on {{ $labels.datname
            }} database on {{ $labels.target }} is invalid"
          impact: "PostgreSQL does not use the index for query execution, which could degrade query performance."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLInvalidIndex
          action: |
            Diagnose the root cause.
            Delete and recreate the index.

      - alert: "PostgreSQLLongRunningQueries"
        expr: |
          pg_long_running_transactions_oldest_timestamp_seconds > 1800
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 1m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "Long running query on {{ $labels.instance }} for >30 minutes."
          impact: "Potential block on other queries, WAL file rotation and vacuum operations."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLLongRunningQueries
          action: |
            Identify and diagnose the blocking queries.
            Terminate these queries if you are sure it is safe to do so.

      - alert: "PostgreSQLMaxConnections"
        expr: |
          max by (target) (pg_stat_connections_count)
          * 100
          / max by (target) (pg_settings_max_connections)
          > 80
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 10m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "{{ $labels.target }} uses >80% of the maximum database connections"
          impact: "New clients might not be able to connect."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLMaxConnections
          action: |
            Reduce the number of clients.
            Increase max_connections (check memory first!).

      - alert: "PostgreSQLReplicationSlotStorageLimit"
        expr: |
          max by (target, slot_name) (pg_replication_slots_available_storage_percent{}) < 20
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 5m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "{{ $labels.slot_name }} on {{ $labels.target }} is close to its storage limit"
          impact: "Potential disk space saturation and replication slot no longer being usable."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/PostgreSQLReplicationSlotStorageLimit
          action: |
            Check replication slot client logs and performance.
            Correct the root cause on the replication slot client.
            Increase max_slot_wal_keep_size to allow more disk space for the replication slot (check free storage first!).
            Increase server storage.

      - alert: "PostgresExporterMissingTarget"
        expr: |
          min(up{job="postgres-exporter", instance!=""}) by (instance) == 0
          + on (dbidentifier) group_left (team) uw_rds_owner_team
        for: 15m
        labels:
          alerttype: stock
          alertgroup: Postgres
        annotations:
          summary: "Postgres Exporter scrape for {{ $labels.target }} failed"
          impact: "{{ $labels.target }} instance is not monitored."
          qonto_runbook: https://qonto.github.io/database-monitoring-framework/latest/runbooks/postgresql/SQLExporterMissingTarget
          action: |
            Check postgres-exporter logs in sys-prom namespace.
            Check if the postgres instance is down.
            Check postgres database connection logs.
            Check if there's an issue with the prometheus_postgres_exporter user.
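
For context on the `+ on (dbidentifier) group_left (team)` joins above: every alert expression is joined against the `uw_rds_owner_team` recording rule (defined in `system-alerts/rds`, not in this commit) so that the owning team's label is attached to each firing alert. The sketch below shows the general shape such a recording rule could take; the source metric `aws_rds_instance_info` and its labels are assumptions for illustration only.

  # Illustrative only -- the real rule lives in `system-alerts/rds`.
  # Assumes a source metric `aws_rds_instance_info` (hypothetical) that already
  # carries `dbidentifier` and `team` labels for every RDS instance.
  groups:
    - name: rds-ownership
      rules:
        - record: uw_rds_owner_team
          # One series per instance, keyed by `dbidentifier` and carrying `team`,
          # so alert expressions can join with `on (dbidentifier) group_left (team)`.
          expr: |
            max by (dbidentifier, team) (aws_rds_instance_info)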
