#!/usr/bin/env bash
set -euo pipefail
# ==========================================================
# Production Incident Response Runbook
# ==========================================================
# Usage: ./incident-runbook.sh [triage|cpu|memory|disk|network|k8s|all]
# ==========================================================
# Defaults, overridable from the environment. NAMESPACE is the namespace
# passed to every `kubectl -n` call; APP is not referenced in the visible
# part of this script.
: "${NAMESPACE:=production}"
: "${APP:=myapp}"
# ANSI colour sequences, stored as literal backslash escapes; the printing
# helpers expand them at output time. NC resets the colour.
RED='\033[0;31m' YELLOW='\033[1;33m'
GREEN='\033[0;32m' NC='\033[0m'
# Print a section banner in the YELLOW colour, preceded and followed by a
# blank line.
# Arguments: $1 - section title, printed literally.
# The colour codes go through %b so their \033 escapes are expanded, while
# the title goes through %s so backslashes in it are never interpreted.
header() { printf '\n%b=== %s ===%b\n\n' "$YELLOW" "$1" "$NC"; }
# Print a success line: a GREEN "[OK]" tag followed by the message.
# Arguments: $1 - message, printed literally (%s), so backslashes in it are
#            not expanded; only the colour codes are (%b).
ok() { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"; }
# Print a warning line: a YELLOW "[WARN]" tag followed by the message.
# Arguments: $1 - message, printed literally (%s), so backslashes in it are
#            not expanded; only the colour codes are (%b).
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"; }
# Print a failure line: a RED "[FAIL]" tag followed by the message.
# Arguments: $1 - message, printed literally (%s), so backslashes in it are
#            not expanded; only the colour codes are (%b).
# Output goes to stdout, like the other helpers.
fail() { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"; }
# ==========================================================
# TRIAGE: Quick health overview
# ==========================================================
# ----------------------------------------------------------
# triage: print a quick, read-only health overview of the host.
# Arguments: none
# Outputs:   report on stdout: timestamp, hostname, uptime, CPU / memory /
#            disk usage, load average, top processes by CPU and memory,
#            recent OOM messages, failed systemd units, listening TCP ports
#            and journal errors from the last hour.
#
# Every probe is best-effort. The script runs under `set -euo pipefail`, so
# without the `|| ...` fallbacks a probe that fails (tool not installed, no
# permission) or a producer killed by SIGPIPE once `head` has read enough
# lines would terminate the whole runbook before the remaining sections
# are printed.
# ----------------------------------------------------------
triage() {
  header "System Overview"
  echo "Timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "Hostname: $(hostname)"
  echo "Uptime: $(uptime)"

  header "Resource Usage"
  echo "--- CPU ---"
  top -bn1 | head -5 || true
  printf '\n--- Memory ---\n'
  free -h || true
  printf '\n--- Disk ---\n'
  df -h / /var /tmp 2>/dev/null || true
  printf '\n--- Load Average ---\n'
  cat /proc/loadavg || true

  header "Top Processes (CPU)"
  ps aux --sort=-%cpu | head -10 || true

  header "Top Processes (Memory)"
  ps aux --sort=-%mem | head -10 || true

  header "Recent OOM Kills"
  # With pipefail the pipeline is non-zero when grep matches nothing (or
  # dmesg is not readable), which is what triggers the fallback text.
  dmesg -T 2>/dev/null | grep -iE 'oom|killed' | tail -5 || echo "None found"

  header "Failed Services"
  systemctl --failed 2>/dev/null || echo "systemctl not available"

  header "Listening Ports"
  ss -tlnp | head -20 || true

  header "Recent Errors in syslog"
  journalctl -p err --since "1 hour ago" --no-pager 2>/dev/null | tail -20 \
    || echo "journalctl not available"
}
# ==========================================================
# CPU Investigation
# ==========================================================
# ----------------------------------------------------------
# investigate_cpu: report load averages, load per core, the most CPU-hungry
# processes, per-CPU utilisation and the processes with the most threads.
# Arguments: none
# Outputs:   report on stdout.
#
# /proc/loadavg and nproc are each read once and reused. Declaration and
# assignment are separate so a failing command substitution is not masked
# by `local`. Probes are best-effort: under `set -euo pipefail` an unguarded
# failure (or a producer killed by SIGPIPE when `head` exits early) would
# abort the whole script.
# ----------------------------------------------------------
investigate_cpu() {
  local loadavg="" load_1m="" cores=""

  header "CPU Investigation"
  echo "Load averages (1/5/15 min):"
  loadavg=$(cat /proc/loadavg 2>/dev/null) || loadavg=""
  echo "${loadavg:-unavailable}"
  cores=$(nproc 2>/dev/null) || cores=""
  echo "CPU cores: ${cores:-unknown}"
  echo ""

  # The first whitespace-separated field is the 1-minute load average.
  load_1m=${loadavg%% *}
  # Only divide when both inputs are usable; a missing or zero core count
  # would otherwise make awk fail on a division by zero.
  if [[ -n "$load_1m" && "$cores" =~ ^[1-9][0-9]*$ ]]; then
    echo "Load per core: $(awk -v load="$load_1m" -v cores="$cores" \
      'BEGIN { printf "%.2f", load / cores }')"
  else
    echo "Load per core: n/a"
  fi

  header "CPU-hungry processes"
  ps -eo pid,user,%cpu,%mem,comm --sort=-%cpu | head -15 || true

  header "Per-CPU usage"
  mpstat -P ALL 1 3 2>/dev/null || sar -u 1 3 2>/dev/null || echo "Install sysstat"

  header "Thread count by process"
  ps -eo nlwp,pid,user,comm --sort=-nlwp | head -10 || true
}
# ==========================================================
# Memory Investigation
# ==========================================================
# ----------------------------------------------------------
# investigate_memory: report overall memory usage, the most memory-hungry
# processes, key /proc/meminfo counters, the OOM score of the top five
# processes by memory, and the ten largest VmSwap entries.
# Arguments: none
# Outputs:   report on stdout.
#
# All loop variables are local so nothing leaks into the caller's scope.
# Probes are best-effort: under `set -euo pipefail` an unguarded failure
# would abort the whole script.
# ----------------------------------------------------------
investigate_memory() {
  local pid name score status_file

  header "Memory Investigation"
  free -h || true
  echo ""

  header "Memory-hungry processes"
  ps -eo pid,user,%mem,rss,vsz,comm --sort=-%mem | head -15 || true

  header "Memory details from /proc/meminfo"
  grep -E "^(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree)" /proc/meminfo || true

  header "OOM score (higher = more likely to be killed)"
  # `read` trims the padding ps puts in front of each PID.
  while read -r pid; do
    [[ -n "$pid" ]] || continue
    # The process may already be gone; fall back to placeholders.
    name=$(cat "/proc/$pid/comm" 2>/dev/null) || name="unknown"
    score=$(cat "/proc/$pid/oom_score" 2>/dev/null) || score="N/A"
    echo " PID $pid ($name): OOM score = $score"
  done < <(ps -eo pid --sort=-%mem --no-headers | head -5)

  header "Swap usage by process"
  # Output lines look like "<file> VmSwap: <n> kB", so field 3 is the size.
  for status_file in /proc/[0-9]*/status; do
    # A process can exit between glob expansion and the read. Ignore that
    # failure instead of letting it end the scan (and, via pipefail, the
    # script).
    awk '/VmSwap/ {if ($2 > 0) print FILENAME, $0}' "$status_file" 2>/dev/null || true
  done | sort -k3 -rn | head -10 || true
}
# ==========================================================
# Disk Investigation
# ==========================================================
# ----------------------------------------------------------
# investigate_disk: report filesystem usage, disk I/O statistics, the ten
# largest files over 100MB, the largest entries under /var/log and /tmp,
# inode usage, and the commands holding the most open files.
# Arguments: none
# Outputs:   report on stdout.
#
# Probes are best-effort: under `set -euo pipefail` an unguarded failure
# would abort the whole script. That matters most for `find /`, which exits
# non-zero as soon as it meets one unreadable directory or a file that
# disappears during the scan.
# ----------------------------------------------------------
investigate_disk() {
  header "Disk Investigation"
  df -h || true
  echo ""

  header "Disk I/O"
  iostat -x 1 3 2>/dev/null || echo "Install sysstat"

  header "Large files (>100MB)"
  # /proc and /sys are pruned: their entries are not files on disk and only
  # add noise and scan errors. `-exec ... {} +` batches the matches into as
  # few `ls` invocations as possible. Field 5 of `ls -l` output is the size.
  find / \( -path /proc -o -path /sys \) -prune -o \
    -type f -size +100M -exec ls -lh {} + 2>/dev/null \
    | sort -k5 -rh | head -10 || true

  header "Large directories"
  # du fails on unreadable entries and when the glob matches nothing.
  du -sh /var/log/* 2>/dev/null | sort -rh | head -10 || true
  echo "---"
  du -sh /tmp/* 2>/dev/null | sort -rh | head -10 || true

  header "Inode usage"
  df -i / /var /tmp 2>/dev/null || true

  header "Open files (top consumers)"
  # Column 1 of lsof output is the command name.
  lsof 2>/dev/null | awk '{print $1}' | sort | uniq -c | sort -rn | head -10 || true
}
# ==========================================================
# Kubernetes Investigation
# ==========================================================
# ----------------------------------------------------------
# investigate_k8s: report node and pod status for $NAMESPACE, pods that are
# neither Running nor Succeeded, the containers with the most restarts,
# recent Warning events, resource usage, logs of up to three pods whose
# status matches CrashLoop/Error/OOM, and HPA / PVC status.
# Globals:   NAMESPACE (read)
# Arguments: none
# Outputs:   report on stdout.
#
# Every kubectl call is best-effort: under `set -euo pipefail` one failing
# call (no permission to list nodes, `kubectl top` unavailable, ...) would
# otherwise abort the script and hide the remaining sections.
# ----------------------------------------------------------
investigate_k8s() {
  local pod

  header "Kubernetes Cluster Health"
  echo "--- Node Status ---"
  kubectl get nodes -o wide || warn "could not list nodes"
  echo -e "\n--- Pod Status (${NAMESPACE}) ---"
  kubectl get pods -n "$NAMESPACE" -o wide \
    || warn "could not list pods in ${NAMESPACE}"

  header "Unhealthy Pods"
  kubectl get pods -n "$NAMESPACE" \
    --field-selector='status.phase!=Running,status.phase!=Succeeded' \
    || warn "could not list unhealthy pods"

  header "Pod Restart Counts (top 10)"
  # Each row is "<restarts>\t<pod>/<container>". The pod name is included
  # because container names repeat across the replicas of a workload.
  kubectl get pods -n "$NAMESPACE" -o json \
    | jq -r '.items[]
        | .metadata.name as $pod
        | (.status.containerStatuses // [])[]
        | select(.restartCount > 0)
        | "\(.restartCount)\t\($pod)/\(.name)"' \
    | sort -rn | head -10 || true

  header "Recent Events (warnings)"
  kubectl get events -n "$NAMESPACE" \
    --field-selector type=Warning \
    --sort-by='.lastTimestamp' | tail -20 \
    || warn "could not fetch events"

  header "Resource Usage"
  kubectl top pods -n "$NAMESPACE" --sort-by=cpu 2>/dev/null | head -15 || true
  echo "---"
  kubectl top nodes 2>/dev/null || warn "kubectl top unavailable"

  header "Recent Pod Logs (crashed/restarting)"
  # Column 3 of `kubectl get pods --no-headers` is the pod STATUS. The pod
  # list is read on fd 3 so the loop body keeps its normal stdin.
  while IFS= read -r pod <&3; do
    echo "--- Logs: $pod ---"
    # Prefer the previous (crashed) container's logs, then the current one.
    kubectl logs -n "$NAMESPACE" "$pod" --tail=30 --previous 2>/dev/null \
      || kubectl logs -n "$NAMESPACE" "$pod" --tail=30 2>/dev/null \
      || warn "no logs available for $pod"
  done 3< <(kubectl get pods -n "$NAMESPACE" --no-headers \
    | awk '$3 ~ /CrashLoop|Error|OOM/ {print $1}' | head -3)

  header "HPA Status"
  kubectl get hpa -n "$NAMESPACE" 2>/dev/null || warn "could not list HPAs"

  header "PVC Status"
  kubectl get pvc -n "$NAMESPACE" 2>/dev/null || warn "could not list PVCs"
}
# ==========================================================
# Network Investigation
# ==========================================================
# ----------------------------------------------------------
# investigate_network: report socket summary, listening services, TCP
# connection counts per state, the ten peers with the most established
# connections, DNS resolution of a fixed list of names, and TCP
# reachability of a fixed list of host:port targets.
# Arguments: none
# Outputs:   report on stdout; ok/fail lines for each DNS and TCP check.
#
# Probes are best-effort: under `set -euo pipefail` an unguarded failure
# would abort the whole script.
# ----------------------------------------------------------
investigate_network() {
  local domain addr target host port

  header "Network Investigation"
  echo "--- Active connections ---"
  ss -s || true

  header "Listening services"
  ss -tlnp || true

  header "Connection states"
  # NR > 1 drops the column-header row so it is not counted as a state.
  ss -tan | awk 'NR > 1 {print $1}' | sort | uniq -c | sort -rn || true

  header "Established connections per IP (top 10)"
  # With a single `state` filter ss omits the State column, so the peer
  # address is not at a fixed index; it is the last field because no
  # process/timer details are requested. Only the trailing ":port" is
  # stripped, which keeps bracketed IPv6 addresses intact.
  ss -tn state established \
    | awk 'NR > 1 { peer = $NF; sub(/:[^:]*$/, "", peer); print peer }' \
    | sort | uniq -c | sort -rn | head -10 || true

  header "DNS resolution test"
  for domain in api.example.com db.internal redis.internal; do
    # dig exits 0 whenever it received a response, including NXDOMAIN, and
    # then simply prints nothing. Success therefore needs a zero exit status
    # AND a non-empty answer.
    if addr=$(dig +short "$domain" A 2>/dev/null) && [[ -n "$addr" ]]; then
      # Show only the first answer line.
      printf '%s\n' "${addr%%$'\n'*}"
      ok "$domain resolves"
    else
      fail "$domain FAILED to resolve"
    fi
  done

  header "TCP connectivity test"
  for target in "db.internal:5432" "redis.internal:6379"; do
    host=${target%%:*}
    port=${target##*:}
    # Opening bash's /dev/tcp/<host>/<port> pseudo-file attempts a TCP
    # connection. Host and port are passed as positional arguments rather
    # than spliced into the command string.
    if timeout 3 bash -c ': > "/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null; then
      ok "$target reachable"
    else
      fail "$target UNREACHABLE"
    fi
  done
}
# ==========================================================
# Main dispatcher
# ==========================================================
# Entry point. Runs the investigation named by the first argument (default
# "triage"). "all" runs triage, cpu, memory, disk and network, in that
# order. Any other value prints the usage line and exits with status 1.
# After a successful run a timestamped completion banner is printed.
main() {
  local mode="${1:-triage}"
  local step

  case "$mode" in
    triage)  triage ;;
    cpu)     investigate_cpu ;;
    memory)  investigate_memory ;;
    disk)    investigate_disk ;;
    network) investigate_network ;;
    k8s)     investigate_k8s ;;
    all)
      for step in triage investigate_cpu investigate_memory \
        investigate_disk investigate_network; do
        "$step"
      done
      ;;
    *)
      echo "Usage: $0 [triage|cpu|memory|disk|network|k8s|all]"
      exit 1
      ;;
  esac

  echo -e "\n${GREEN}=== Diagnostics complete: $(date -u +%Y-%m-%dT%H:%M:%SZ) ===${NC}"
}

main "$@"
# Essential diagnostic commands and runbook procedures for production
# incidents. Quickly triage high CPU, memory leaks, disk full, and network
# issues with structured investigation scripts. Includes severity
# classification, escalation procedures, and post-incident review templates.