Incident response runbook and diagnostic scripts

10799 views
                  #!/usr/bin/env bash
# Strict mode: -e aborts on an unhandled non-zero exit status, -u treats the
# expansion of an unset variable as an error, and pipefail makes a pipeline
# fail when any of its stages fails.
set -euo pipefail

# ==========================================================
# Production Incident Response Runbook
# ==========================================================
# Usage: ./incident-runbook.sh [triage|cpu|memory|disk|network|k8s|all]
#        With no argument, the triage overview is run.
# ==========================================================

# Settings that may be overridden from the environment; the defaults apply
# when a variable is unset or empty. NAMESPACE is the namespace passed to
# kubectl. NOTE(review): APP is not referenced in the visible part of the
# script - confirm whether it is still needed.
: "${NAMESPACE:=production}"
: "${APP:=myapp}"

# ANSI color sequences, stored as literal backslash escapes that are expanded
# only when printed (e.g. via `echo -e`).
NC='\033[0m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'

# Output helpers.
#   header <title>          blank line, highlighted "=== title ===", blank line
#   ok|warn|fail <message>  one line prefixed with a colored status tag
# The color variables are expanded with %b (they hold backslash escapes),
# while the caller's text is printed with %s so that backslashes in it (for
# example in a path) are shown literally instead of being interpreted.
header() { printf '\n%b=== %s ===%b\n\n' "$YELLOW" "$1" "$NC"; }
ok()     { printf '%b[OK]%b %s\n' "$GREEN" "$NC" "$1"; }
warn()   { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$1"; }
fail()   { printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$1"; }

# ==========================================================
# TRIAGE: Quick health overview
# ==========================================================
#######################################
# Quick health overview of the local host: identity, CPU/memory/disk usage,
# load, top processes, recent OOM kills, failed units, listening ports and
# recent journal errors.
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   0; every probe is best-effort
#######################################
triage() (
  # The body runs in a subshell so errexit can be relaxed here without
  # touching the caller's shell options: a missing tool, an unreadable log or
  # a SIGPIPE from `| head` must not cut the rest of the report short.
  set +e
  local kernel_log oom_lines

  header "System Overview"
  echo "Timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
  echo "Hostname:  $(hostname)"
  echo "Uptime:    $(uptime)"

  header "Resource Usage"
  echo "--- CPU ---"
  top -bn1 | head -n 5

  echo -e "\n--- Memory ---"
  free -h

  echo -e "\n--- Disk ---"
  df -h / /var /tmp 2>/dev/null

  echo -e "\n--- Load Average ---"
  cat /proc/loadavg

  header "Top Processes (CPU)"
  ps aux --sort=-%cpu | head -n 10

  header "Top Processes (Memory)"
  ps aux --sort=-%mem | head -n 10

  header "Recent OOM Kills"
  # Tell "kernel log could not be read" apart from "no OOM events found".
  if kernel_log=$(dmesg -T 2>/dev/null); then
    oom_lines=$(grep -i "oom\|killed" <<<"$kernel_log" | tail -n 5)
    if [[ -n "$oom_lines" ]]; then
      printf '%s\n' "$oom_lines"
    else
      echo "None found"
    fi
  else
    warn "dmesg unavailable or not permitted; cannot check for OOM kills"
  fi

  header "Failed Services"
  systemctl --failed 2>/dev/null || echo "systemctl not available"

  header "Listening Ports"
  ss -tlnp | head -n 20

  header "Recent Errors in syslog"
  journalctl -p err --since "1 hour ago" --no-pager 2>/dev/null | tail -n 20 \
    || warn "journalctl unavailable or journal not readable"

  return 0
)

# ==========================================================
# CPU Investigation
# ==========================================================
#######################################
# CPU investigation: load averages, load per core, the busiest processes,
# per-CPU utilisation (mpstat, falling back to sar) and thread counts.
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   0; every probe is best-effort
#######################################
investigate_cpu() (
  # Subshell body: errexit is relaxed locally so one failing probe (or a
  # SIGPIPE from `| head`) does not abort the remaining sections.
  set +e
  local loadavg load_1m cores

  header "CPU Investigation"

  # Sample each value once so the printed figures and the derived ratio agree.
  loadavg=$(cat /proc/loadavg)
  cores=$(nproc)
  read -r load_1m _ <<<"$loadavg"

  echo "Load averages (1/5/15 min):"
  echo "$loadavg"
  echo "CPU cores: $cores"
  echo ""

  # Only divide when both inputs are usable (cores must be a positive integer).
  if [[ -n "$load_1m" && "$cores" =~ ^[1-9][0-9]*$ ]]; then
    echo "Load per core: $(awk -v load="$load_1m" -v cores="$cores" \
      'BEGIN { printf "%.2f", load / cores }')"
  else
    warn "Could not determine load per core"
  fi

  header "CPU-hungry processes"
  ps -eo pid,user,%cpu,%mem,comm --sort=-%cpu | head -n 15

  header "Per-CPU usage"
  mpstat -P ALL 1 3 2>/dev/null || sar -u 1 3 2>/dev/null || echo "Install sysstat"

  header "Thread count by process"
  ps -eo nlwp,pid,user,comm --sort=-nlwp | head -n 10

  return 0
)

# ==========================================================
# Memory Investigation
# ==========================================================
#######################################
# Memory investigation: overall usage, the largest processes, key
# /proc/meminfo counters, OOM scores of the top five memory consumers and
# per-process swap usage.
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   0; every probe is best-effort
#######################################
investigate_memory() (
  # Subshell body: errexit is relaxed locally so that a failing probe, or a
  # process that exits while /proc is being scanned, does not abort the run.
  set +e
  local pid name score status_file

  header "Memory Investigation"

  free -h
  echo ""

  header "Memory-hungry processes"
  ps -eo pid,user,%mem,rss,vsz,comm --sort=-%mem | head -n 15

  header "Memory details from /proc/meminfo"
  grep -E "^(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree)" /proc/meminfo

  header "OOM score (higher = more likely to be killed)"
  # `read` strips the padding ps puts in front of each PID.
  while read -r pid; do
    [[ -n "$pid" ]] || continue
    name=$(cat "/proc/$pid/comm" 2>/dev/null || echo "unknown")
    score=$(cat "/proc/$pid/oom_score" 2>/dev/null || echo "N/A")
    echo "  PID $pid ($name): OOM score = $score"
  done < <(ps -eo pid --sort=-%mem --no-headers | head -n 5)

  header "Swap usage by process"
  for status_file in /proc/[0-9]*/status; do
    # The process may have exited since the glob was expanded; skip it.
    [[ -r "$status_file" ]] || continue
    awk '/VmSwap/ {if ($2 > 0) print FILENAME, $0}' "$status_file" 2>/dev/null
  done | sort -k3 -rn | head -n 10

  return 0
)

# ==========================================================
# Disk Investigation
# ==========================================================
#######################################
# Disk investigation: filesystem usage, I/O statistics, the largest files and
# directories, inode usage and the commands holding the most open files.
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   0; every probe is best-effort
#######################################
investigate_disk() (
  # Subshell body: errexit is relaxed locally. `find /` and `du` exit
  # non-zero on any unreadable path, which would otherwise end the script in
  # the middle of the report.
  set +e

  header "Disk Investigation"

  df -h
  echo ""

  header "Disk I/O"
  iostat -x 1 3 2>/dev/null || echo "Install sysstat"

  header "Large files (>100MB)"
  # /proc and /sys are kernel pseudo-filesystems: skip them so virtual files
  # do not crowd out real ones. `-exec ... +` batches files into few ls calls.
  find / \( -path /proc -o -path /sys \) -prune \
    -o -type f -size +100M -exec ls -lh {} + 2>/dev/null \
    | sort -k5 -rh | head -n 10

  header "Large directories"
  du -sh /var/log/* 2>/dev/null | sort -rh | head -n 10
  echo "---"
  du -sh /tmp/* 2>/dev/null | sort -rh | head -n 10

  header "Inode usage"
  df -i / /var /tmp 2>/dev/null

  header "Open files (top consumers)"
  if command -v lsof >/dev/null 2>&1; then
    # NR > 1 drops lsof's header row so it is not counted as a command.
    lsof 2>/dev/null | awk 'NR > 1 {print $1}' | sort | uniq -c | sort -rn | head -n 10
  else
    warn "lsof not installed; skipping open-file summary"
  fi

  return 0
)

# ==========================================================
# Kubernetes Investigation
# ==========================================================
#######################################
# Kubernetes investigation for $NAMESPACE: nodes, pods, unhealthy pods,
# restart counts, warning events, resource usage, logs of crashing pods, HPA
# and PVC status.
# Globals:   NAMESPACE (read)
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   1 if kubectl is not installed, otherwise 0 (probes are
#            best-effort)
#######################################
investigate_k8s() (
  # Subshell body: errexit is relaxed locally so an optional component (for
  # example a failing `kubectl top`) does not abort the remaining sections.
  set +e
  local pod top_out

  header "Kubernetes Cluster Health"

  if ! command -v kubectl >/dev/null 2>&1; then
    warn "kubectl not found in PATH; cannot run Kubernetes diagnostics"
    return 1
  fi

  echo "--- Node Status ---"
  kubectl get nodes -o wide

  echo -e "\n--- Pod Status (${NAMESPACE}) ---"
  kubectl get pods -n "$NAMESPACE" -o wide

  header "Unhealthy Pods"
  kubectl get pods -n "$NAMESPACE" --field-selector='status.phase!=Running,status.phase!=Succeeded'

  header "Pod Restart Counts (top 10)"
  # Report "<pod>/<container>": a bare container name is ambiguous when many
  # pods run the same container.
  kubectl get pods -n "$NAMESPACE" -o json \
    | jq -r '.items[]
        | .metadata.name as $pod
        | (.status.containerStatuses // [])[]
        | select(.restartCount > 0)
        | "\(.restartCount)\t\($pod)/\(.name)"' \
    | sort -rn | head -n 10

  header "Recent Events (warnings)"
  kubectl get events -n "$NAMESPACE" \
    --field-selector type=Warning \
    --sort-by='.lastTimestamp' | tail -n 20

  header "Resource Usage"
  # Capture first so a failure is reported instead of silently swallowed.
  if top_out=$(kubectl top pods -n "$NAMESPACE" --sort-by=cpu 2>/dev/null); then
    printf '%s\n' "$top_out" | head -n 15
  else
    warn "kubectl top pods failed (pod metrics unavailable)"
  fi
  echo "---"
  kubectl top nodes 2>/dev/null || warn "kubectl top nodes failed (node metrics unavailable)"

  header "Recent Pod Logs (crashed/restarting)"
  # The pod list is read on fd 3 so kubectl inside the loop keeps its stdin.
  while IFS= read -r pod <&3; do
    echo "--- Logs: $pod ---"
    kubectl logs -n "$NAMESPACE" "$pod" --tail=30 --previous 2>/dev/null \
      || kubectl logs -n "$NAMESPACE" "$pod" --tail=30 2>/dev/null \
      || warn "no logs available for $pod"
  done 3< <(kubectl get pods -n "$NAMESPACE" --no-headers \
    | awk '$3 ~ /CrashLoop|Error|OOM/ {print $1}' | head -n 3)

  header "HPA Status"
  kubectl get hpa -n "$NAMESPACE" 2>/dev/null || warn "could not list HPAs"

  header "PVC Status"
  kubectl get pvc -n "$NAMESPACE" 2>/dev/null || warn "could not list PVCs"

  return 0
)

# ==========================================================
# Network Investigation
# ==========================================================
#######################################
# Network investigation: socket summary, listening services, TCP state
# counts, busiest peers, DNS resolution and TCP reachability checks.
# Globals:   DNS_CHECK_DOMAINS (read, optional) - space-separated names to
#              resolve; defaults to the three names built into the script
#            TCP_CHECK_TARGETS (read, optional) - space-separated host:port
#              pairs to probe; defaults to the two built-in targets
# Arguments: none
# Outputs:   human-readable report on stdout
# Returns:   0; every probe is best-effort
#######################################
investigate_network() (
  # Subshell body: errexit is relaxed locally so one failing probe does not
  # abort the remaining sections.
  set +e
  local domain answer target host port
  local -a domains targets

  read -r -a domains <<<"${DNS_CHECK_DOMAINS:-api.example.com db.internal redis.internal}"
  read -r -a targets <<<"${TCP_CHECK_TARGETS:-db.internal:5432 redis.internal:6379}"

  header "Network Investigation"

  echo "--- Active connections ---"
  ss -s

  header "Listening services"
  ss -tlnp

  header "Connection states"
  # NR > 1 skips the column header so "State" is not counted as a state.
  ss -tan | awk 'NR > 1 {print $1}' | sort | uniq -c | sort -rn

  header "Established connections per IP (top 10)"
  # With a state filter ss drops the State column, so the peer address is
  # field 4. Only the trailing ":port" is removed, which keeps bracketed IPv6
  # addresses intact.
  ss -tn state established \
    | awk 'NR > 1 { peer = $4; sub(/:[^:]*$/, "", peer); print peer }' \
    | sort | uniq -c | sort -rn | head -n 10

  header "DNS resolution test"
  if ! command -v dig >/dev/null 2>&1; then
    warn "dig not installed; skipping DNS resolution test"
  else
    for domain in "${domains[@]}"; do
      # dig exits 0 for NXDOMAIN too, so success is judged by whether an
      # answer line was printed; ';' lines are dig diagnostics, not answers.
      answer=$(dig +short "$domain" A 2>/dev/null | grep -v '^;' | head -n 1)
      if [[ -n "$answer" ]]; then
        printf '%s\n' "$answer"
        ok "$domain resolves"
      else
        fail "$domain FAILED to resolve"
      fi
    done
  fi

  header "TCP connectivity test"
  for target in "${targets[@]}"; do
    host=${target%:*}
    port=${target##*:}
    # Host and port are passed as positional arguments rather than spliced
    # into the command string, so odd characters cannot alter the command.
    if timeout 3 bash -c ': >"/dev/tcp/$1/$2"' _ "$host" "$port" 2>/dev/null; then
      ok "$target reachable"
    else
      fail "$target UNREACHABLE"
    fi
  done

  return 0
)

# ==========================================================
# Main dispatcher
# ==========================================================
# Select the investigation named by the first argument (triage when none is
# given), then print a completion banner with a UTC timestamp.
case "${1:-triage}" in
  triage)   triage ;;
  cpu)      investigate_cpu ;;
  memory)   investigate_memory ;;
  disk)     investigate_disk ;;
  network)  investigate_network ;;
  k8s)      investigate_k8s ;;
  all)
    triage
    investigate_cpu
    investigate_memory
    investigate_disk
    investigate_network
    # "all" covers Kubernetes as well, but only where kubectl is installed,
    # so plain hosts still complete the run. Running it on the left of `||`
    # keeps a failing kubectl call from aborting before the final banner.
    if command -v kubectl >/dev/null 2>&1; then
      investigate_k8s || warn "Kubernetes diagnostics did not complete cleanly"
    else
      warn "kubectl not found; skipping Kubernetes diagnostics"
    fi
    ;;
  -h|--help)
    echo "Usage: $0 [triage|cpu|memory|disk|network|k8s|all]"
    exit 0
    ;;
  *)
    # An unknown mode is an error: usage goes to stderr with non-zero status.
    echo "Usage: $0 [triage|cpu|memory|disk|network|k8s|all]" >&2
    exit 1
    ;;
esac

echo -e "\n${GREEN}=== Diagnostics complete: $(date -u +%Y-%m-%dT%H:%M:%SZ) ===${NC}"
Essential diagnostic commands and runbook procedures for production incidents. Quickly triage high CPU, memory leaks, disk full, and network issues with structured investigation scripts. Includes severity classification, escalation procedures, and post-incident review templates.
Ryan Nakamura

More from Ryan Nakamura

Incident response runbook and diagnostic scripts

0 Comments

More from Ryan Nakamura