-- Connection statistics
SELECT
count(*) AS total_connections,
count(*) FILTER (WHERE state = 'active') AS active,
count(*) FILTER (WHERE state = 'idle') AS idle,
count(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_tx,
count(*) FILTER (WHERE wait_event_type IS NOT NULL) AS waiting
FROM pg_stat_activity;
-- Connection limits and usage
SELECT
setting::INT AS max_connections,
(SELECT count(*) FROM pg_stat_activity) AS current_connections,
setting::INT - (SELECT count(*) FROM pg_stat_activity) AS available_connections,
ROUND(100.0 * (SELECT count(*) FROM pg_stat_activity) / setting::INT, 2) AS pct_used
FROM pg_settings
WHERE name = 'max_connections';
-- Query performance metrics (pg_stat_statements)
SELECT
LEFT(query, 100) AS query_preview,
calls,
ROUND(total_exec_time::numeric, 2) AS total_ms,
ROUND(mean_exec_time::numeric, 2) AS mean_ms,
ROUND(min_exec_time::numeric, 2) AS min_ms,
ROUND(max_exec_time::numeric, 2) AS max_ms,
ROUND(stddev_exec_time::numeric, 2) AS stddev_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;
-- Queries per second (QPS)
WITH current_stats AS (
SELECT SUM(calls) AS total_calls, NOW() AS measured_at
FROM pg_stat_statements
)
SELECT
total_calls,
ROUND(
total_calls / EXTRACT(EPOCH FROM (measured_at - pg_postmaster_start_time())),
2
) AS queries_per_second
FROM current_stats;
-- Cache hit ratio (should be > 99%)
SELECT
'index' AS type,
ROUND(
100.0 * SUM(idx_blks_hit) / NULLIF(SUM(idx_blks_hit + idx_blks_read), 0),
2
) AS hit_ratio_pct
FROM pg_statio_user_indexes
UNION ALL
SELECT
'table' AS type,
ROUND(
100.0 * SUM(heap_blks_hit) / NULLIF(SUM(heap_blks_hit + heap_blks_read), 0),
2
) AS hit_ratio_pct
FROM pg_statio_user_tables;
-- Database size and growth
SELECT
datname,
pg_size_pretty(pg_database_size(datname)) AS size,
pg_database_size(datname) AS size_bytes
FROM pg_database
WHERE datname NOT IN ('template0', 'template1')
ORDER BY pg_database_size(datname) DESC;
-- Table size and bloat indicators
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct,
last_vacuum,
last_autovacuum
FROM pg_stat_user_tables
WHERE n_live_tup > 0
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
LIMIT 20;
-- Transaction rate
SELECT
datname,
xact_commit + xact_rollback AS total_transactions,
xact_commit,
xact_rollback,
ROUND(100.0 * xact_rollback / NULLIF(xact_commit + xact_rollback, 0), 2) AS rollback_pct,
blks_read,
blks_hit,
ROUND(100.0 * blks_hit / NULLIF(blks_read + blks_hit, 0), 2) AS cache_hit_pct
FROM pg_stat_database
WHERE datname = current_database();
-- Lock monitoring
SELECT
locktype,
mode,
COUNT(*) AS lock_count
FROM pg_locks
GROUP BY locktype, mode
ORDER BY lock_count DESC;
-- Blocking queries
SELECT
blocked.pid AS blocked_pid,
blocked.query AS blocked_query,
blocking.pid AS blocking_pid,
blocking.query AS blocking_query,
NOW() - blocked.query_start AS blocked_duration
FROM pg_stat_activity blocked
JOIN pg_locks blocked_locks ON blocked.pid = blocked_locks.pid
JOIN pg_locks blocking_locks ON blocked_locks.locktype = blocking_locks.locktype
AND blocked_locks.database IS NOT DISTINCT FROM blocking_locks.database
AND blocked_locks.relation IS NOT DISTINCT FROM blocking_locks.relation
AND blocked_locks.pid != blocking_locks.pid
JOIN pg_stat_activity blocking ON blocking_locks.pid = blocking.pid
WHERE NOT blocked_locks.granted
AND blocking_locks.granted;
-- Replication lag
SELECT
client_addr,
application_name,
state,
pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS lag_bytes,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) AS lag,
EXTRACT(EPOCH FROM (NOW() - pg_last_xact_replay_timestamp())) AS lag_seconds
FROM pg_stat_replication;
-- Checkpoint statistics
SELECT
checkpoints_timed,
checkpoints_req,
ROUND(100.0 * checkpoints_req / NULLIF(checkpoints_timed + checkpoints_req, 0), 2) AS req_checkpoint_pct,
buffers_checkpoint,
buffers_clean,
maxwritten_clean,
buffers_backend,
buffers_backend_fsync,
checkpoint_write_time,
checkpoint_sync_time
FROM pg_stat_bgwriter;
-- Alert: High connection usage (> 80%)
WITH conn_stats AS (
SELECT
setting::INT AS max_conn,
(SELECT count(*) FROM pg_stat_activity) AS current_conn
FROM pg_settings WHERE name = 'max_connections'
)
SELECT
'HIGH_CONNECTION_USAGE' AS alert,
current_conn,
max_conn,
ROUND(100.0 * current_conn / max_conn, 2) AS pct_used
FROM conn_stats
WHERE current_conn > max_conn * 0.8;
-- Alert: Long-running queries (> 5 minutes)
SELECT
'LONG_RUNNING_QUERY' AS alert,
pid,
usename,
datname,
state,
NOW() - query_start AS duration,
query
FROM pg_stat_activity
WHERE state != 'idle'
AND query NOT LIKE '%pg_stat_activity%'
AND NOW() - query_start > INTERVAL '5 minutes'
ORDER BY duration DESC;
-- Alert: Idle in transaction (> 1 minute)
SELECT
'IDLE_IN_TRANSACTION' AS alert,
pid,
usename,
application_name,
NOW() - state_change AS idle_duration,
query
FROM pg_stat_activity
WHERE state = 'idle in transaction'
AND NOW() - state_change > INTERVAL '1 minute';
-- Alert: High table bloat (> 20% dead tuples)
SELECT
'HIGH_TABLE_BLOAT' AS alert,
schemaname,
tablename,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct
FROM pg_stat_user_tables
WHERE n_live_tup > 1000
AND ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) > 20
ORDER BY n_dead_tup DESC;
-- Alert: Cache hit ratio low (< 95%)
WITH cache_stats AS (
SELECT
ROUND(
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0),
2
) AS hit_ratio
FROM pg_stat_database
WHERE datname = current_database()
)
SELECT
'LOW_CACHE_HIT_RATIO' AS alert,
hit_ratio
FROM cache_stats
WHERE hit_ratio < 95;
-- Alert: Replication lag (> 100MB)
SELECT
'HIGH_REPLICATION_LAG' AS alert,
client_addr,
application_name,
pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS lag_bytes,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) AS lag
FROM pg_stat_replication
WHERE pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) > 100 * 1024 * 1024;
-- Alert: Transaction ID wraparound risk (> 50%)
SELECT
'XID_WRAPAROUND_RISK' AS alert,
datname,
age(datfrozenxid) AS xid_age,
ROUND(100.0 * age(datfrozenxid) / 2147483647, 2) AS pct_toward_wraparound
FROM pg_database
WHERE age(datfrozenxid) > 2147483647 * 0.5
ORDER BY age(datfrozenxid) DESC;
-- Alert: Unused indexes (never scanned, > 100MB)
SELECT
'UNUSED_INDEX' AS alert,
schemaname,
tablename,
indexname,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE '%_pkey'
AND pg_relation_size(indexrelid) > 100 * 1024 * 1024
ORDER BY pg_relation_size(indexrelid) DESC;
-- Health check dashboard query
SELECT
jsonb_build_object(
'connections', (
SELECT jsonb_build_object(
'current', count(*),
'active', count(*) FILTER (WHERE state = 'active'),
'max', (SELECT setting::INT FROM pg_settings WHERE name = 'max_connections')
)
FROM pg_stat_activity
),
'cache_hit_ratio', (
SELECT ROUND(
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0),
2
)
FROM pg_stat_database
WHERE datname = current_database()
),
'database_size', (
SELECT pg_size_pretty(pg_database_size(current_database()))
),
'qps', (
SELECT ROUND(
SUM(xact_commit + xact_rollback) /
EXTRACT(EPOCH FROM (NOW() - stats_reset)),
2
)
FROM pg_stat_database
WHERE datname = current_database()
),
'bloated_tables', (
SELECT count(*)
FROM pg_stat_user_tables
WHERE n_live_tup > 1000
AND 100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0) > 20
)
) AS health_metrics;
-- Time-series metrics collection (run every minute)
CREATE TABLE IF NOT EXISTS db_metrics (
collected_at TIMESTAMP NOT NULL,
metric_name VARCHAR(100) NOT NULL,
metric_value DECIMAL(20,2),
PRIMARY KEY (collected_at, metric_name)
);
-- Collect metrics
INSERT INTO db_metrics (collected_at, metric_name, metric_value)
SELECT
NOW(),
metric_name,
metric_value
FROM (
SELECT 'connections_total' AS metric_name,
count(*)::DECIMAL AS metric_value
FROM pg_stat_activity
UNION ALL
SELECT 'connections_active',
count(*) FILTER (WHERE state = 'active')
FROM pg_stat_activity
UNION ALL
SELECT 'cache_hit_ratio',
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0)
FROM pg_stat_database
WHERE datname = current_database()
UNION ALL
SELECT 'database_size_mb',
pg_database_size(current_database()) / (1024.0 * 1024.0)
) metrics;