# Prometheus 알림 규칙 설정
#
# 시스템 성능 및 서비스 안정성을 모니터링하기 위한 Prometheus 알림 규칙 모음입니다. CPU, 메모리, 디스크 사용량과 함께 응답 시간 및 에러율 기반의 알림을 포함합니다.

# Top-level list of alerting rule groups defined in this file.
groups:
# Host resource alerts (CPU, memory, root filesystem) built on node_* metrics.
- name: Infrastructure
  rules:
  # High CPU usage: 100 minus the per-instance average idle rate (2m window),
  # expressed as a percentage, must stay above 80 for 5 minutes to fire.
  - alert: HostHighCpuLoad
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100)
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU load (instance {{ $labels.instance }})'
      # Literal block scalar; the two extra spaces on the VALUE/LABELS lines
      # are part of the rendered text.
      description: |-
        CPU load is > 80%
          VALUE = {{ $value }}
          LABELS = {{ $labels }}

  # High memory usage: (total - available) / total, as a percentage, must stay
  # above 90 for 5 minutes to fire.
  - alert: HostHighMemoryUsage
    expr: >-
      (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)
      / node_memory_MemTotal_bytes * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High Memory usage (instance {{ $labels.instance }})'
      # Literal block scalar; the two extra spaces on the VALUE/LABELS lines
      # are part of the rendered text.
      description: |-
        Memory usage is > 90%
          VALUE = {{ $value }}
          LABELS = {{ $labels }}

  # Low disk space: available bytes as a percentage of filesystem size for the
  # "/" mountpoint must stay below 10 for 2 minutes to fire.
  - alert: HostLowDiskSpace
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"} * 100)
      / node_filesystem_size_bytes{mountpoint="/"} < 10
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Low Disk Space (instance {{ $labels.instance }})'
      # Literal block scalar; the two extra spaces on the VALUE/LABELS lines
      # are part of the rendered text.
      description: |-
        Disk space on / is < 10%
          VALUE = {{ $value }}
          LABELS = {{ $labels }}

# HTTP service alerts: 5xx error ratio and 95th-percentile request latency.
- name: Application
  rules:
  # High HTTP Error Rate
  # Share of requests answered with a 5xx status over the last 5 minutes,
  # computed per (job, instance); fires when it stays above 5% for 2 minutes.
  #
  # Both sides are aggregated before dividing. A bare
  # `rate(a{status=~"5.."}) / rate(a)` matches series one-to-one on every
  # label, including `status`, so each 5xx series would be divided by itself
  # and the ratio would always be 1 whenever any 5xx traffic exists.
  # Aggregating by (job, instance) drops `status` (and any other per-request
  # labels) so the quotient is a true error ratio, and keeps the `instance`
  # label that the summary annotation references.
  - alert: HttpHighErrorRate
    expr: >-
      sum by (job, instance) (rate(http_requests_total{status=~"5.."}[5m]))
      / sum by (job, instance) (rate(http_requests_total[5m]))
      > 0.05
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "High HTTP 5xx error rate (instance {{ $labels.instance }})"
      description: "Error rate is > 5%\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

  # High response latency: the 0.95 quantile of request duration, estimated
  # from 5m bucket rates summed per (le, path), must stay above 1.0 for
  # 5 minutes to fire.
  - alert: HttpHighLatency
    expr: >-
      histogram_quantile(0.95,
      sum by (le, path) (rate(http_request_duration_seconds_bucket[5m])))
      > 1.0
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High HTTP latency (path {{ $labels.path }})'
      # Literal block scalar; the two extra spaces on the VALUE/LABELS lines
      # are part of the rendered text.
      description: |-
        95th percentile of latency is > 1s
          VALUE = {{ $value }}
          LABELS = {{ $labels }}