Database observability and monitoring metrics
Maria Garcia
Feb 2026
2 tabs
-- Connection statistics
SELECT
count(*) AS total_connections,
count(*) FILTER (WHERE state = 'active') AS active,
count(*) FILTER (WHERE state = 'idle') AS idle,
count(*) FILTER (WHERE state = 'idle in transaction') AS idle_in_tx,
count(*) FILTER (WHERE wait_event_type IS NOT NULL) AS waiting
FROM pg_stat_activity;
-- Connection limits and usage
SELECT
setting::INT AS max_connections,
(SELECT count(*) FROM pg_stat_activity) AS current_connections,
setting::INT - (SELECT count(*) FROM pg_stat_activity) AS available_connections,
ROUND(100.0 * (SELECT count(*) FROM pg_stat_activity) / setting::INT, 2) AS pct_used
FROM pg_settings
WHERE name = 'max_connections';
-- Query performance metrics (pg_stat_statements)
SELECT
LEFT(query, 100) AS query_preview,
calls,
ROUND(total_exec_time::numeric, 2) AS total_ms,
ROUND(mean_exec_time::numeric, 2) AS mean_ms,
ROUND(min_exec_time::numeric, 2) AS min_ms,
ROUND(max_exec_time::numeric, 2) AS max_ms,
ROUND(stddev_exec_time::numeric, 2) AS stddev_ms,
rows
FROM pg_stat_statements
ORDER BY total_exec_time DESC
LIMIT 10;
-- Queries per second (QPS)
WITH current_stats AS (
SELECT SUM(calls) AS total_calls, NOW() AS measured_at
FROM pg_stat_statements
)
SELECT
total_calls,
ROUND(
total_calls / EXTRACT(EPOCH FROM (measured_at - pg_postmaster_start_time())),
2
) AS queries_per_second
FROM current_stats;
-- Cache hit ratio (should be > 99%)
SELECT
'index' AS type,
ROUND(
100.0 * SUM(idx_blks_hit) / NULLIF(SUM(idx_blks_hit + idx_blks_read), 0),
2
) AS hit_ratio_pct
FROM pg_statio_user_indexes
UNION ALL
SELECT
'table' AS type,
ROUND(
100.0 * SUM(heap_blks_hit) / NULLIF(SUM(heap_blks_hit + heap_blks_read), 0),
2
) AS hit_ratio_pct
FROM pg_statio_user_tables;
-- Database size and growth
SELECT
datname,
pg_size_pretty(pg_database_size(datname)) AS size,
pg_database_size(datname) AS size_bytes
FROM pg_database
WHERE datname NOT IN ('template0', 'template1')
ORDER BY pg_database_size(datname) DESC;
-- Table size and bloat indicators
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) AS total_size,
pg_size_pretty(pg_relation_size(schemaname||'.'||tablename)) AS table_size,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct,
last_vacuum,
last_autovacuum
FROM pg_stat_user_tables
WHERE n_live_tup > 0
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
LIMIT 20;
-- Transaction rate
SELECT
datname,
xact_commit + xact_rollback AS total_transactions,
xact_commit,
xact_rollback,
ROUND(100.0 * xact_rollback / NULLIF(xact_commit + xact_rollback, 0), 2) AS rollback_pct,
blks_read,
blks_hit,
ROUND(100.0 * blks_hit / NULLIF(blks_read + blks_hit, 0), 2) AS cache_hit_pct
FROM pg_stat_database
WHERE datname = current_database();
-- Lock monitoring
SELECT
locktype,
mode,
COUNT(*) AS lock_count
FROM pg_locks
GROUP BY locktype, mode
ORDER BY lock_count DESC;
-- Blocking queries
SELECT
blocked.pid AS blocked_pid,
blocked.query AS blocked_query,
blocking.pid AS blocking_pid,
blocking.query AS blocking_query,
NOW() - blocked.query_start AS blocked_duration
FROM pg_stat_activity blocked
JOIN pg_locks blocked_locks ON blocked.pid = blocked_locks.pid
JOIN pg_locks blocking_locks ON blocked_locks.locktype = blocking_locks.locktype
AND blocked_locks.database IS NOT DISTINCT FROM blocking_locks.database
AND blocked_locks.relation IS NOT DISTINCT FROM blocking_locks.relation
AND blocked_locks.pid != blocking_locks.pid
JOIN pg_stat_activity blocking ON blocking_locks.pid = blocking.pid
WHERE NOT blocked_locks.granted
AND blocking_locks.granted;
-- Replication lag
SELECT
client_addr,
application_name,
state,
pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS lag_bytes,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) AS lag,
EXTRACT(EPOCH FROM (NOW() - pg_last_xact_replay_timestamp())) AS lag_seconds
FROM pg_stat_replication;
-- Checkpoint statistics
SELECT
checkpoints_timed,
checkpoints_req,
ROUND(100.0 * checkpoints_req / NULLIF(checkpoints_timed + checkpoints_req, 0), 2) AS req_checkpoint_pct,
buffers_checkpoint,
buffers_clean,
maxwritten_clean,
buffers_backend,
buffers_backend_fsync,
checkpoint_write_time,
checkpoint_sync_time
FROM pg_stat_bgwriter;
-- Alert: High connection usage (> 80%)
WITH conn_stats AS (
SELECT
setting::INT AS max_conn,
(SELECT count(*) FROM pg_stat_activity) AS current_conn
FROM pg_settings WHERE name = 'max_connections'
)
SELECT
'HIGH_CONNECTION_USAGE' AS alert,
current_conn,
max_conn,
ROUND(100.0 * current_conn / max_conn, 2) AS pct_used
FROM conn_stats
WHERE current_conn > max_conn * 0.8;
-- Alert: Long-running queries (> 5 minutes)
SELECT
'LONG_RUNNING_QUERY' AS alert,
pid,
usename,
datname,
state,
NOW() - query_start AS duration,
query
FROM pg_stat_activity
WHERE state != 'idle'
AND query NOT LIKE '%pg_stat_activity%'
AND NOW() - query_start > INTERVAL '5 minutes'
ORDER BY duration DESC;
-- Alert: Idle in transaction (> 1 minute)
SELECT
'IDLE_IN_TRANSACTION' AS alert,
pid,
usename,
application_name,
NOW() - state_change AS idle_duration,
query
FROM pg_stat_activity
WHERE state = 'idle in transaction'
AND NOW() - state_change > INTERVAL '1 minute';
-- Alert: High table bloat (> 20% dead tuples)
SELECT
'HIGH_TABLE_BLOAT' AS alert,
schemaname,
tablename,
n_live_tup,
n_dead_tup,
ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) AS dead_pct
FROM pg_stat_user_tables
WHERE n_live_tup > 1000
AND ROUND(100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0), 2) > 20
ORDER BY n_dead_tup DESC;
-- Alert: Cache hit ratio low (< 95%)
WITH cache_stats AS (
SELECT
ROUND(
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0),
2
) AS hit_ratio
FROM pg_stat_database
WHERE datname = current_database()
)
SELECT
'LOW_CACHE_HIT_RATIO' AS alert,
hit_ratio
FROM cache_stats
WHERE hit_ratio < 95;
-- Alert: Replication lag (> 100MB)
SELECT
'HIGH_REPLICATION_LAG' AS alert,
client_addr,
application_name,
pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) AS lag_bytes,
pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) AS lag
FROM pg_stat_replication
WHERE pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn) > 100 * 1024 * 1024;
-- Alert: Transaction ID wraparound risk (> 50%)
SELECT
'XID_WRAPAROUND_RISK' AS alert,
datname,
age(datfrozenxid) AS xid_age,
ROUND(100.0 * age(datfrozenxid) / 2147483647, 2) AS pct_toward_wraparound
FROM pg_database
WHERE age(datfrozenxid) > 2147483647 * 0.5
ORDER BY age(datfrozenxid) DESC;
-- Alert: Unused indexes (never scanned, > 100MB)
SELECT
'UNUSED_INDEX' AS alert,
schemaname,
tablename,
indexname,
pg_size_pretty(pg_relation_size(indexrelid)) AS index_size,
idx_scan
FROM pg_stat_user_indexes
WHERE idx_scan = 0
AND indexrelname NOT LIKE '%_pkey'
AND pg_relation_size(indexrelid) > 100 * 1024 * 1024
ORDER BY pg_relation_size(indexrelid) DESC;
-- Health check dashboard query
SELECT
jsonb_build_object(
'connections', (
SELECT jsonb_build_object(
'current', count(*),
'active', count(*) FILTER (WHERE state = 'active'),
'max', (SELECT setting::INT FROM pg_settings WHERE name = 'max_connections')
)
FROM pg_stat_activity
),
'cache_hit_ratio', (
SELECT ROUND(
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0),
2
)
FROM pg_stat_database
WHERE datname = current_database()
),
'database_size', (
SELECT pg_size_pretty(pg_database_size(current_database()))
),
'qps', (
SELECT ROUND(
SUM(xact_commit + xact_rollback) /
EXTRACT(EPOCH FROM (NOW() - stats_reset)),
2
)
FROM pg_stat_database
WHERE datname = current_database()
),
'bloated_tables', (
SELECT count(*)
FROM pg_stat_user_tables
WHERE n_live_tup > 1000
AND 100.0 * n_dead_tup / NULLIF(n_live_tup + n_dead_tup, 0) > 20
)
) AS health_metrics;
-- Time-series metrics collection (run every minute)
CREATE TABLE IF NOT EXISTS db_metrics (
collected_at TIMESTAMP NOT NULL,
metric_name VARCHAR(100) NOT NULL,
metric_value DECIMAL(20,2),
PRIMARY KEY (collected_at, metric_name)
);
-- Collect metrics
INSERT INTO db_metrics (collected_at, metric_name, metric_value)
SELECT
NOW(),
metric_name,
metric_value
FROM (
SELECT 'connections_total' AS metric_name,
count(*)::DECIMAL AS metric_value
FROM pg_stat_activity
UNION ALL
SELECT 'connections_active',
count(*) FILTER (WHERE state = 'active')
FROM pg_stat_activity
UNION ALL
SELECT 'cache_hit_ratio',
100.0 * SUM(blks_hit) / NULLIF(SUM(blks_hit + blks_read), 0)
FROM pg_stat_database
WHERE datname = current_database()
UNION ALL
SELECT 'database_size_mb',
pg_database_size(current_database()) / (1024.0 * 1024.0)
) metrics;
2 files · sql
Explain with highlit
Observability provides insight into database health and performance. I monitor key metrics—queries per second, connection count, cache hit ratio. Slow query logs identify performance problems. Query latency percentiles show user experience. Lock wait time reveals contention. Replication lag affects read consistency. Disk I/O patterns guide hardware decisions. Understanding baseline metrics enables anomaly detection. Database size growth predicts capacity needs. Index usage statistics inform optimization. Proper observability prevents outages through early warning. Essential for production databases, SRE practices. PostgreSQL provides rich statistics views for comprehensive monitoring.