Gaps and Islands — Detecting Continuous Runs & Outages
Gaps and Islands — Detecting Continuous Runs & Outages
The gaps-and-islands problem: given a sequence of timestamped events, find contiguous runs (islands) and the breaks between them (gaps). Classic use cases: uptime/downtime periods, consecutive days of activity, unbroken trading sessions.
-- Find continuous high-CPU periods per host (cpu > 70% for 10+ consecutive minutes)
WITH flagged AS (
SELECT
host,
recorded_at,
cpu_pct,
CASE WHEN cpu_pct > 70 THEN 1 ELSE 0 END AS is_high
FROM ts_server_metrics
WHERE recorded_at >= NOW() - INTERVAL '7 days'
),
-- Double ROW_NUMBER trick to identify islands
island_groups AS (
SELECT
host,
recorded_at,
cpu_pct,
is_high,
ROW_NUMBER() OVER (PARTITION BY host ORDER BY recorded_at)
- ROW_NUMBER() OVER (PARTITION BY host, is_high ORDER BY recorded_at)
AS grp
FROM flagged
)
SELECT
host,
is_high,
MIN(recorded_at) AS period_start,
MAX(recorded_at) AS period_end,
COUNT(*) AS minutes_duration,
ROUND(AVG(cpu_pct)::numeric, 1) AS avg_cpu_pct,
ROUND(MAX(cpu_pct)::numeric, 1) AS peak_cpu_pct
FROM island_groups
WHERE is_high = 1
GROUP BY host, is_high, grp
HAVING COUNT(*) >= 10
ORDER BY minutes_duration DESC
LIMIT 20;