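# Prometheus alerting rule configuration
#
# A collection of Prometheus alerting rules for monitoring system performance
# and service stability. It covers CPU, memory, and disk usage, together with
# alerts based on response time and error rate.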
groups:
# Host-level resource alerts (CPU, memory, root filesystem) built on node_* metrics.
- name: Infrastructure
  rules:
  # CPU: busy percentage per instance, computed as 100 minus the average idle
  # rate (as a percentage) over a 2m window. Fires after 5m above 80.
  - alert: HostHighCpuLoad
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load (instance {{ $labels.instance }})'
      description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Memory: share of total memory that is not "available", as a percentage.
  # Fires after 5m above 90.
  - alert: HostHighMemoryUsage
    expr: >-
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
      / node_memory_MemTotal_bytes * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High Memory usage (instance {{ $labels.instance }})'
      description: "Memory usage is > 90%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Disk: percentage of space still available on the "/" mountpoint only.
  # Fires after 2m below 10.
  - alert: HostLowDiskSpace
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"} * 100)
      / node_filesystem_size_bytes{mountpoint="/"} < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Low Disk Space (instance {{ $labels.instance }})'
      description: "Disk space on / is < 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Application-level HTTP alerts based on request-count rates and a latency histogram.
- name: Application
  rules:
  # 5xx ratio per job/instance over a 5m window. Fires after 2m above 0.05.
  # Both sides of the division are aggregated with the same
  # `sum by (job, instance)` so they match one-to-one on identical label sets.
  # Dividing the raw rates would pair each 5xx series only with itself (the
  # `status` label takes part in matching), so the ratio would always be 1
  # and the alert would fire on any 5xx traffic at all.
  - alert: HttpHighErrorRate
    expr: >-
      sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job, instance) (rate(http_requests_total[5m]))
      > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'High HTTP 5xx error rate (instance {{ $labels.instance }})'
      description: "Error rate is > 5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # Latency: 95th percentile per path, estimated from the bucket rates of the
  # request-duration histogram over 5m. `le` must be kept in the aggregation
  # because histogram_quantile needs the bucket boundaries.
  # Fires after 5m above 1.0 (the metric name indicates seconds).
  - alert: HttpHighLatency
    expr: >-
      histogram_quantile(0.95, sum by (le, path) (rate(http_request_duration_seconds_bucket[5m])))
      > 1.0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High HTTP latency (path {{ $labels.path }})'
      description: "95th percentile of latency is > 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"