Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
81d4fb3
feat(build.gradle): prometheus 의존성 추가
Kimgyuilli Jan 21, 2026
212a2a3
feat(global): application-monitoring.yaml 추가
Kimgyuilli Jan 21, 2026
8d72446
feat(docker): 모니터링용 docker-compose 작성
Kimgyuilli Jan 21, 2026
d2ec41c
feat(monitoring): prometheus 설정 파일 추가
Kimgyuilli Jan 21, 2026
f0b5b4b
feat(monitoring): Prometheus + Grafana 모니터링 스택 구축
Kimgyuilli Jan 21, 2026
3c918a6
feat(monitoring): jvm 대시보드 패널 구성
Kimgyuilli Jan 21, 2026
b900456
feat(monitoring): Grafana 디스코드 모니터링 구현
Kimgyuilli Jan 21, 2026
2e2eae9
feat(monitoring): docker-compose 모니터링 yml 추가
Kimgyuilli Jan 21, 2026
fd61eaa
feat(grafana): 디스코드 알림 형식 개선
Kimgyuilli Jan 22, 2026
685e0e6
Merge branch 'develop' into 129-feature/prometheus-grafana-monitoring…
Kimgyuilli Jan 22, 2026
f4a9448
feat(prometheus): 실행시 healthCheck 로직 추가
Kimgyuilli Jan 22, 2026
04b468b
feat(monitoring): 대시보드 설정 업데이트
Kimgyuilli Jan 22, 2026
0956988
feat(monitoring): Prometheus 상태 알림 플로우 구축
Kimgyuilli Jan 22, 2026
38658ad
feat(grafana): 대시보드 refresh 시간 조정
Kimgyuilli Jan 22, 2026
25d324e
feat(grafana): threshold 값 조정
Kimgyuilli Jan 22, 2026
e83ba64
feat(grafana): docker-compose 파일 운영 환경별로 분리
Kimgyuilli Jan 22, 2026
21b929b
refactor(grafana): application-monitoring 불필요한 라인 제거
Kimgyuilli Jan 22, 2026
4a0d26b
feat(grafana): 각 룰에 rule name 추가
Kimgyuilli Jan 22, 2026
79af7e7
feat(grafana): 디스코드 응답 개선
Kimgyuilli Jan 22, 2026
8426379
feat(grafana): 디스코드 알림 그룹화 변경
Kimgyuilli Jan 22, 2026
4ea8266
t push:wq
Kimgyuilli Jan 22, 2026
2b619ca
feat(monitoring): docker linux 환경 호환성 개선
Kimgyuilli Jan 22, 2026
630c9b4
Merge branch 'develop' into 129-feature/prometheus-grafana-monitoring…
Kimgyuilli Jan 22, 2026
c0d1397
Merge branch 'develop' into 129-feature/prometheus-grafana-monitoring…
Kimgyuilli Jan 22, 2026
1a4c61c
feat(monitoring): 불필요한 버전 명시 정리
Kimgyuilli Jan 22, 2026
438c506
feat(prometheus): 3.5.1(LTS) 버전으로 업데이트
Kimgyuilli Jan 23, 2026
1154c99
feat(grafana): 11.6.9(LTS) 버전으로 업데이트
Kimgyuilli Jan 23, 2026
18ca48b
Merge branch '129-feature/prometheus-grafana-monitoring-system' of ht…
Kimgyuilli Jan 23, 2026
f34c007
Merge branch 'develop' into 129-feature/prometheus-grafana-monitoring…
Kimgyuilli Jan 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ dependencies {
implementation 'net.logstash.logback:logstash-logback-encoder:8.1'
implementation 'com.github.napstr:logback-discord-appender:1.0.0'

// Metrics
implementation 'io.micrometer:micrometer-registry-prometheus'

// Test
testImplementation 'org.springframework.boot:spring-boot-starter-test'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
Expand Down
53 changes: 53 additions & 0 deletions docker-compose.monitoring.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Monitoring stack (Prometheus + Grafana); run alongside the app's main compose file.
services:
  # Metrics collector; reaches the host-run application via host.docker.internal.
  prometheus:
    image: prom/prometheus:v3.5.1
    container_name: cherrish-prometheus
    ports:
      - "9090:9090"
    volumes:
      # Scrape configuration plus a named volume for persistent TSDB storage.
      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Keep 15 days of metrics; disk usage grows with scrape volume.
      - '--storage.tsdb.retention.time=15d'
      # Enables config reload via HTTP POST /-/reload without a container restart.
      - '--web.enable-lifecycle'
    healthcheck:
      # /-/healthy returns 200 once the server is ready; Grafana's
      # depends_on condition waits for this check to pass.
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 10s
      timeout: 5s
      retries: 3
      start_period: 10s
    extra_hosts:
      # Maps host.docker.internal to the host gateway so scraping the
      # host-run app also works on Linux engines.
      - "host.docker.internal:host-gateway"
    networks:
      - monitoring
    restart: unless-stopped
Comment on lines +1 to +25
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

프로덕션 안정성을 위해 리소스 제한 추가를 권장합니다.

Prometheus 컨테이너에 메모리/CPU 제한이 없으면, 메트릭 데이터 증가 시 호스트 리소스를 과도하게 사용할 수 있습니다. 특히 15일 retention과 함께 사용하면 메모리 사용량이 점진적으로 증가할 수 있습니다.

♻️ 리소스 제한 추가 예시
  prometheus:
    image: prom/prometheus:v3.5.1
    container_name: cherrish-prometheus
+   deploy:
+     resources:
+       limits:
+         memory: 2G
+         cpus: '1.0'
+       reservations:
+         memory: 512M
    ports:
      - "9090:9090"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
services:
prometheus:
image: prom/prometheus:v3.5.1
container_name: cherrish-prometheus
ports:
- "9090:9090"
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-lifecycle'
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
interval: 10s
timeout: 5s
retries: 3
start_period: 10s
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
- monitoring
restart: unless-stopped
services:
prometheus:
image: prom/prometheus:v3.5.1
container_name: cherrish-prometheus
deploy:
resources:
limits:
memory: 2G
cpus: '1.0'
reservations:
memory: 512M
ports:
- "9090:9090"
volumes:
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
- prometheus_data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
- '--web.enable-lifecycle'
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
interval: 10s
timeout: 5s
retries: 3
start_period: 10s
extra_hosts:
- "host.docker.internal:host-gateway"
networks:
- monitoring
restart: unless-stopped
🤖 Prompt for AI Agents
In `@docker-compose.monitoring.yml` around lines 1 - 25, Add explicit CPU and
memory limits for the prometheus service to prevent it from exhausting host
resources: update the prometheus service (service name "prometheus") to include
resource constraints—for Docker Compose v3 use deploy.resources.limits with cpu
and memory (e.g., cpu: "1.0", memory: "2G"), and if supporting older Compose
formats add equivalent mem_limit and cpus entries—so the container has bounded
memory/CPU while retaining the existing command, volumes, healthcheck, networks,
and restart settings.


grafana:
image: grafana/grafana:11.6.9
container_name: cherrish-grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
- DISCORD_MONITORING_WEBHOOK_URL=${DISCORD_MONITORING_WEBHOOK_URL}
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning
- grafana_data:/var/lib/grafana
networks:
- monitoring
depends_on:
prometheus:
condition: service_healthy
restart: unless-stopped
Comment on lines +27 to +45
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

Grafana 컨테이너에도 리소스 제한 추가를 권장합니다.

Grafana도 대시보드 복잡도와 동시 사용자 수에 따라 리소스 사용량이 증가할 수 있습니다.

♻️ 리소스 제한 추가 예시
  grafana:
    image: grafana/grafana:11.6.9
    container_name: cherrish-grafana
+   deploy:
+     resources:
+       limits:
+         memory: 512M
+         cpus: '0.5'
+       reservations:
+         memory: 128M
    ports:
      - "3000:3000"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
grafana:
image: grafana/grafana:11.6.9
container_name: cherrish-grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
- DISCORD_MONITORING_WEBHOOK_URL=${DISCORD_MONITORING_WEBHOOK_URL}
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning
- grafana_data:/var/lib/grafana
networks:
- monitoring
depends_on:
prometheus:
condition: service_healthy
restart: unless-stopped
grafana:
image: grafana/grafana:11.6.9
container_name: cherrish-grafana
deploy:
resources:
limits:
memory: 512M
cpus: '0.5'
reservations:
memory: 128M
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=admin
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
- DISCORD_MONITORING_WEBHOOK_URL=${DISCORD_MONITORING_WEBHOOK_URL}
volumes:
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning
- grafana_data:/var/lib/grafana
networks:
- monitoring
depends_on:
prometheus:
condition: service_healthy
restart: unless-stopped
🤖 Prompt for AI Agents
In `@docker-compose.monitoring.yml` around lines 27 - 45, The Grafana service
lacks resource limits; update the grafana service block (service named
"grafana", image "grafana/grafana:11.6.9") to include resource constraints by
adding deploy.resources.limits (e.g., cpu and memory) and
deploy.resources.reservations to cap and reserve CPU/memory for the container;
if using plain docker-compose (non-swarm) add equivalent mem_limit/cpu_shares or
use compose v2/3 fields appropriate for your setup so Grafana cannot exhaust
host resources.


# Named volumes keep Prometheus TSDB data and Grafana state across restarts.
volumes:
  prometheus_data:
  grafana_data:

# Private bridge network shared by both monitoring containers.
networks:
  monitoring:
    driver: bridge
23 changes: 23 additions & 0 deletions monitoring/grafana/provisioning/alerting/contactpoints.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Grafana alerting provisioning: Discord contact point.
# DISCORD_MONITORING_WEBHOOK_URL is injected via the container environment
# (see docker-compose.monitoring.yml).
apiVersion: 1

contactPoints:
  - orgId: 1
    name: discord-monitoring
    receivers:
      - uid: discord-monitoring
        type: discord
        settings:
          url: ${DISCORD_MONITORING_WEBHOOK_URL}
          use_discord_username: true
          avatar_url: "https://grafana.com/static/assets/img/fav32.png"
          # Title shows the overall group status plus the alert count.
          title: '{{ if eq .Status "firing" }}:rotating_light: ALERT{{ else }}:white_check_mark: RESOLVED{{ end }} ({{ len .Alerts }} alerts)'
          # One section per alert; $.Status is the group status, while the
          # rulename/severity labels and annotations come from rules.yml.
          message: |
            {{ range .Alerts }}
            {{ if eq $.Status "firing" }}:red_circle:{{ else }}:green_circle:{{ end }} **{{ .Labels.rulename }}**
            :warning: **Severity:** {{ .Labels.severity | toUpper }}
            :clipboard: **Summary:** {{ .Annotations.summary }}
            :memo: **Detail:** {{ .Annotations.description }}
            :link: **View:** {{ .GeneratorURL }}

            {{ end }}
          # Also send a notification when an alert resolves.
          disableResolveMessage: false
10 changes: 10 additions & 0 deletions monitoring/grafana/provisioning/alerting/policies.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Grafana alerting provisioning: notification policy tree.
apiVersion: 1

policies:
  - orgId: 1
    # Route every alert to the Discord contact point (contactpoints.yml).
    receiver: discord-monitoring
    # Batch alerts from the same dashboard folder into one notification.
    group_by:
      - grafana_folder
    group_wait: 60s      # delay before the first notification of a new group
    group_interval: 5m   # minimum gap between notifications for one group
    repeat_interval: 4h  # re-send interval while an alert stays firing
161 changes: 161 additions & 0 deletions monitoring/grafana/provisioning/alerting/rules.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
# Grafana alerting provisioning: alert rules for the Cherrish server.
# All rules query the "prometheus" datasource (uid must match the datasource
# provisioning) and carry a "rulename" label consumed by the Discord message
# template in contactpoints.yml.
apiVersion: 1

groups:
  - orgId: 1
    name: cherrish-alerts
    folder: Cherrish
    interval: 1m  # evaluation interval for every rule in this group
    rules:
      # Fires when 5xx responses exceed 5% of all requests for 2 minutes.
      - uid: high-error-rate
        title: High Error Rate
        condition: C
        data:
          # A: 5xx request rate; "or vector(0)" keeps the series present
          # even when there are no errors.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              expr: sum(rate(http_server_requests_seconds_count{application="cherrish", status=~"5.."}[5m])) or vector(0)
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: total request rate; "or vector(1)" avoids division by zero
          # when the app receives no traffic.
          - refId: B
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              expr: sum(rate(http_server_requests_seconds_count{application="cherrish"}[5m])) or vector(1)
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
          # C: error ratio with the 5% threshold folded into the math
          # expression. A server-side "math" expression fires on any
          # non-zero result and does not apply the classic "conditions"
          # block, so the comparison must live in the expression itself —
          # the previous "$A / $B" fired on ANY non-zero error rate.
          - refId: C
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              expression: "($A / $B) > 0.05"
              intervalMs: 1000
              maxDataPoints: 43200
              refId: C
              type: math
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          summary: "High error rate detected on Cherrish Server"
          description: "Error rate is above 5% for the last 2 minutes"
        labels:
          severity: critical
          rulename: "High Error Rate"
        isPaused: false

      # Fires when the 95th-percentile response time stays above 1s for 5 minutes.
      - uid: high-latency
        title: High Latency (P95)
        condition: B
        data:
          # A: P95 latency computed from the Micrometer histogram buckets.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              expr: histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{application="cherrish"}[5m])) by (le))
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: threshold expression — fires when A > 1 second.
          - refId: B
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 1
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - B
                  reducer:
                    type: last
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        noDataState: NoData
        execErrState: Error
        for: 5m
        annotations:
          summary: "High latency detected on Cherrish Server"
          description: "P95 response time is above 1 second for the last 5 minutes"
        labels:
          severity: warning
          rulename: "High Latency (P95)"
        isPaused: false

      # Fires when Prometheus cannot scrape the application (up < 1) for 2 minutes.
      - uid: metrics-collection-health
        title: Metrics Collection Down
        condition: B
        data:
          # A: scrape health for the app target (1 = up, 0 = down);
          # job name must match monitoring/prometheus/prometheus.yml.
          - refId: A
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: prometheus
            model:
              expr: up{job="cherrish-server"}
              intervalMs: 1000
              maxDataPoints: 43200
              refId: A
          # B: threshold expression — fires when A < 1.
          - refId: B
            relativeTimeRange:
              from: 300
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 1
                    type: lt
                  operator:
                    type: and
                  query:
                    params:
                      - B
                  reducer:
                    type: last
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        # Alerting (not NoData): if the "up" series vanishes entirely —
        # target dropped from the scrape config or Prometheus itself broken —
        # this watchdog must still fire rather than go silent.
        noDataState: Alerting
        execErrState: Error
        for: 2m
        annotations:
          summary: "Cherrish Server metrics collection is down"
          description: "Prometheus cannot scrape metrics from Cherrish Server. The application may be down or unreachable."
        labels:
          severity: critical
          rulename: "Metrics Collection Down"
        isPaused: false
11 changes: 11 additions & 0 deletions monitoring/grafana/provisioning/dashboards/dashboard.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Grafana dashboard provisioning: load JSON dashboards from the mounted
# provisioning directory into the "Cherrish" folder.
apiVersion: 1

providers:
  - name: 'Cherrish Dashboards'
    orgId: 1
    folder: 'Cherrish'
    type: file
    # Provisioned dashboards cannot be deleted from the UI.
    disableDeletion: true
    # Re-scan the JSON directory every 2 minutes.
    updateIntervalSeconds: 120
    options:
      path: /etc/grafana/provisioning/dashboards/json
Loading