# Prometheus configuration
```yaml
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  scrape_timeout: 10s

# Recording and alerting rules
rule_files:
  - /etc/prometheus/rules/*.yml

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Scrape targets
scrape_configs:
  # Prometheus self-monitoring
  - job_name: prometheus
    static_configs:
      - targets: ['localhost:9090']

  # Application metrics
  - job_name: web-app
    metrics_path: /metrics
    scrape_interval: 10s
    static_configs:
      - targets:
          - app1:3000
          - app2:3000
          - app3:3000
        labels:
          service: web-app
          environment: production

  # Node exporter (system metrics)
  - job_name: node-exporter
    static_configs:
      - targets:
          - node1:9100
          - node2:9100
          - node3:9100

  # PostgreSQL exporter
  - job_name: postgres
    static_configs:
      - targets: ['postgres-exporter:9187']
        labels:
          service: database

  # Redis exporter
  - job_name: redis
    static_configs:
      - targets: ['redis-exporter:9121']

  # Kubernetes Pod discovery
  - job_name: kubernetes-pods
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      # Only keep pods annotated prometheus.io/scrape: "true"
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      # Honor a custom metrics path from prometheus.io/path
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      # Rewrite the scrape address to the port from prometheus.io/port
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_port, __address__]
        action: replace
        regex: (\d+);([^:]+)(?::\d+)?
        replacement: $2:$1
        target_label: __address__
      - source_labels: [__meta_kubernetes_namespace]
        target_label: namespace
      - source_labels: [__meta_kubernetes_pod_name]
        target_label: pod
```
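The relabel rules in the `kubernetes-pods` job implement the common `prometheus.io/*` annotation convention: a pod opts in to scraping by annotating itself. A minimal sketch of such a pod (the pod and image names are illustrative, not part of this setup):

```yaml
# Hypothetical pod opting in to annotation-based scraping
apiVersion: v1
kind: Pod
metadata:
  name: web-app-abc123
  annotations:
    prometheus.io/scrape: "true"    # matched by the keep rule
    prometheus.io/path: "/metrics"  # copied into __metrics_path__
    prometheus.io/port: "3000"      # rewritten into __address__
spec:
  containers:
    - name: web-app
      image: example/web-app:latest
      ports:
        - containerPort: 3000
```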
The alerting rules cannot live in `prometheus.yml` itself; rule groups go in a separate file matched by the `rule_files` glob above, for example `/etc/prometheus/rules/alerts.yml`:

```yaml
# /etc/prometheus/rules/alerts.yml
groups:
  - name: application
    rules:
      # High error rate
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum(rate(http_requests_total[5m]))
          > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (>5%)"
          runbook: "https://wiki.example.com/runbooks/high-error-rate"

      # High latency
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
          > 1.0
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "P95 latency above 1s"
          description: "P95 latency is {{ $value }}s"

      # Pod crash looping
      - alert: PodCrashLooping
        expr: |
          rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
  - name: infrastructure
    rules:
      # High CPU usage
      - alert: HighCPUUsage
        expr: |
          100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          > 85
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "High CPU on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # Disk space low
      - alert: DiskSpaceLow
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100
          < 15
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "{{ $value }}% disk remaining on {{ $labels.mountpoint }}"

      # Memory pressure
      - alert: HighMemoryUsage
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
          > 90
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "High memory on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"
      # Database connections (sum both sides: pg_stat_activity_count carries
      # per-database/state labels, so a direct division would not match)
      - alert: DatabaseConnectionsHigh
        expr: |
          sum(pg_stat_activity_count) / sum(pg_settings_max_connections) * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool >80%"
```
Prometheus collects and stores time-series metrics using a pull model: at each configured interval it scrapes the `/metrics` endpoint of every target. `prometheus.yml` defines those targets in `scrape_configs`; `static_configs` lists fixed addresses, while `kubernetes_sd_configs` discovers Pods dynamically and relabeling filters them down to the ones that should be scraped.

Metrics are queried with PromQL: `rate()` computes per-second rates over a time window, and `histogram_quantile()` derives percentiles from histogram buckets. Recording rules pre-compute expensive queries on a schedule and store the results as new series. Alerting rules evaluate a PromQL expression against a threshold and fire only after the condition has held for the configured `for` duration, which suppresses brief spikes.

Alertmanager receives fired alerts and routes them to receivers such as Slack, PagerDuty, or email. Labels classify alerts and drive that routing; `group_by` batches related alerts into a single notification, and `inhibit_rules` mute lower-severity alerts when a related critical alert is already firing, preventing alert storms. With the Prometheus Operator on Kubernetes, scraping is declared through ServiceMonitor resources instead of hand-written `scrape_configs`.
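Recording rules use the same group format as the alerting rules above. A minimal sketch that pre-computes the error ratio queried by the `HighErrorRate` alert (the file path and the `job:http_errors:ratio5m` rule name are illustrative):

```yaml
# Hypothetical recording rule file, e.g. /etc/prometheus/rules/recording.yml
groups:
  - name: recording
    interval: 30s
    rules:
      # Store the 5m error ratio so dashboards and alerts can query
      # job:http_errors:ratio5m instead of re-running the division.
      - record: job:http_errors:ratio5m
        expr: |
          sum by (job) (rate(http_requests_total{status=~"5.."}[5m]))
          /
          sum by (job) (rate(http_requests_total[5m]))
```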
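Routing lives in Alertmanager's own `alertmanager.yml`. The sketch below shows `group_by` and `inhibit_rules` in context; the receiver names, Slack webhook, and PagerDuty key are placeholders, not values from this setup:

```yaml
# Hypothetical alertmanager.yml (receiver endpoints are placeholders)
route:
  receiver: slack-default
  group_by: [alertname, namespace]  # batch related alerts into one notification
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    - matchers:
        - severity="critical"
      receiver: pagerduty-oncall

receivers:
  - name: slack-default
    slack_configs:
      - api_url: https://hooks.slack.com/services/XXX/YYY/ZZZ
        channel: "#alerts"
  - name: pagerduty-oncall
    pagerduty_configs:
      - routing_key: <pagerduty-events-v2-key>

# Mute warnings on an instance that already has a critical alert firing
inhibit_rules:
  - source_matchers:
      - severity="critical"
    target_matchers:
      - severity="warning"
    equal: [instance]
```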
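With the Prometheus Operator, a ServiceMonitor replaces a hand-written scrape job. A hypothetical equivalent of the `web-app` job above (the namespace, label selector, and port name are assumptions about the corresponding Service object):

```yaml
# Hypothetical ServiceMonitor for the web-app service
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: web-app
  namespace: monitoring
spec:
  selector:
    matchLabels:
      app: web-app            # must match the Service's labels
  namespaceSelector:
    matchNames: [production]
  endpoints:
    - port: metrics           # named port on the Service
      path: /metrics
      interval: 10s
```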