# Prometheus configuration
```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

# Recording and alerting rules
rule_files:
  - /etc/prometheus/rules/*.yml

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Scrape targets
scrape_configs:
  # Prometheus self-monitoring
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']

  # Application metrics
  - job_name: web-app
    metrics_path: /metrics
    scrape_interval: 10s
    static_configs:
      - targets:
          - app1:3000
          - app2:3000
          - app3:3000
        labels:
          service: web-app
          environment: production

  # Node exporter (system metrics)
  - job_name: node-exporter
    static_configs:
      - targets:
          - node1:9100
          - node2:9100
          - node3:9100

  # PostgreSQL exporter
  - job_name: postgres
    static_configs:
      - targets: ['postgres-exporter:9187']
        labels:
          service: database

  # Redis exporter
  - job_name: redis
    static_configs:
      - targets: ['redis-exporter:9121']

  # Kubernetes Pod discovery
  - job_name: kubernetes-pods
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only keep pods annotated prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Honor a custom metrics path from prometheus.io/path
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scrape address to the port from prometheus.io/port
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __address__]
        action: replace
        regex: (\d+);([^:]+)(?::\d+)?
        replacement: $2:$1
        target_label: __address__
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod
```
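The relabel rules in the `kubernetes-pods` job implement the common `prometheus.io/*` annotation convention: a pod opts in to scraping by annotating itself. A minimal sketch of such a pod (the pod and image names are illustrative, not part of this setup):

```yaml
# Hypothetical pod opting in to annotation-based scraping
apiVersion: v1
kind: Pod
metadata:
  name: web-app-abc123
  annotations:
    prometheus.io/scrape: "true"    # matched by the keep rule
    prometheus.io/path: "/metrics"  # copied into __metrics_path__
    prometheus.io/port: "3000"      # rewritten into __address__
spec:
  containers:
    - name: web-app
      image: example/web-app:latest
      ports:
        - containerPort: 3000
```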
The alerting rules cannot live in `prometheus.yml` itself; rule groups go in a separate file matched by the `rule_files` glob above, for example `/etc/prometheus/rules/alerts.yml`:

```yaml
# /etc/prometheus/rules/alerts.yml
groups:
  - name: application
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (>5%)"
          runbook: "https://wiki.example.com/runbooks/high-error-rate"

      # High latency
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
          > 1.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 1s"
          description: "P95 latency is {{ $value }}s"

      # Pod crash looping
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
  - name: infrastructure
    rules:
      # High CPU usage
      - alert: HighCPUUsage
        expr: |
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Disk space low
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100
          < 15
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "{{ $value }}% disk remaining on {{ $labels.mountpoint }}"

      # Memory pressure
      - alert: HighMemoryUsage
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
          > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"
      # Database connections (sum both sides: pg_stat_activity_count carries
      # per-database/state labels, so a direct division would not match)
      - alert: DatabaseConnectionsHigh
        expr: |
          sum(pg_stat_activity_count) / sum(pg_settings_max_connections) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool >80%"
```
Prometheus collects and stores time-series metrics using a pull model: at each configured interval it scrapes the `/metrics` endpoint of every target. `prometheus.yml` defines those targets in `scrape_configs`; `static_configs` lists fixed addresses, while `kubernetes_sd_configs` discovers Pods dynamically and relabeling filters them down to the ones that should be scraped.

Metrics are queried with PromQL: `rate()` computes per-second rates over a time window, and `histogram_quantile()` derives percentiles from histogram buckets. Recording rules pre-compute expensive queries on a schedule and store the results as new series. Alerting rules evaluate a PromQL expression against a threshold and fire only after the condition has held for the configured `for` duration, which suppresses brief spikes.

Alertmanager receives fired alerts and routes them to receivers such as Slack, PagerDuty, or email. Labels classify alerts and drive that routing; `group_by` batches related alerts into a single notification, and `inhibit_rules` mute lower-severity alerts when a related critical alert is already firing, preventing alert storms. With the Prometheus Operator on Kubernetes, scraping is declared through ServiceMonitor resources instead of hand-written `scrape_configs`.
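Recording rules use the same group format as the alerting rules above. A minimal sketch that pre-computes the error ratio queried by the `HighErrorRate` alert (the file path and the `job:http_errors:ratio5m` rule name are illustrative):

```yaml
# Hypothetical recording rule file, e.g. /etc/prometheus/rules/recording.yml
groups:
  - name: recording
    interval: 30s
    rules:
      # Store the 5m error ratio so dashboards and alerts can query
      # job:http_errors:ratio5m instead of re-running the division.
      - record: job:http_errors:ratio5m
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum by (job) (rate(http_requests_total[5m]))
```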
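Routing lives in Alertmanager's own `alertmanager.yml`. The sketch below shows `group_by` and `inhibit_rules` in context; the receiver names, Slack webhook, and PagerDuty key are placeholders, not values from this setup:

```yaml
# Hypothetical alertmanager.yml (receiver endpoints are placeholders)
route:
  receiver: slack-default
  group_by: [alertname, namespace]  # batch related alerts into one notification
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    - matchers:
        - severity="critical"
      receiver: pagerduty-oncall

receivers:
  - name: slack-default
    slack_configs:
      - api_url: https://hooks.slack.com/services/XXX/YYY/ZZZ
        channel: "#alerts"
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: <pagerduty-events-v2-key>

# Mute warnings on an instance that already has a critical alert firing
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: [instance]
```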
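With the Prometheus Operator, a ServiceMonitor replaces a hand-written scrape job. A hypothetical equivalent of the `web-app` job above (the namespace, label selector, and port name are assumptions about the corresponding Service object):

```yaml
# Hypothetical ServiceMonitor for the web-app service
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: web-app
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: web-app            # must match the Service's labels
  namespaceSelector:
    matchNames: [production]
  endpoints:
    - port: metrics           # named port on the Service
      path: /metrics
      interval: 10s
```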