Sessionization — Reconstructing User Sessions from Events
Sessionization — Reconstructing User Sessions from Events
Raw clickstream events are individual rows. A session is a group of events from the same user with no gap longer than 30 minutes. Sessionization reconstructs these groups purely in SQL using window functions.
-- Assign session boundaries based on 30-minute idle timeout
WITH event_gaps AS (
SELECT
user_id,
session_id,
event_type,
page,
created_at,
LAG(created_at) OVER (
PARTITION BY user_id ORDER BY created_at
) AS prev_event_at,
EXTRACT(EPOCH FROM (
created_at
- LAG(created_at) OVER (
PARTITION BY user_id ORDER BY created_at
)
)) / 60 AS idle_minutes
FROM ts_web_events
),
session_starts AS (
SELECT *,
SUM(CASE WHEN idle_minutes IS NULL OR idle_minutes > 30 THEN 1 ELSE 0 END)
OVER (PARTITION BY user_id ORDER BY created_at)
AS synthetic_session_id
FROM event_gaps
),
-- Add exit page before GROUP BY (window functions cannot be used inside FILTER)
with_exit AS (
SELECT *,
LAST_VALUE(page) OVER (
PARTITION BY user_id, synthetic_session_id
ORDER BY created_at
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
) AS exit_page
FROM session_starts
)
SELECT
user_id,
synthetic_session_id,
COUNT(*) AS event_count,
MIN(created_at) AS session_start,
MAX(created_at) AS session_end,
ROUND(EXTRACT(EPOCH FROM (MAX(created_at) - MIN(created_at))) / 60, 1)
AS duration_minutes,
COUNT(*) FILTER (WHERE event_type = 'page_view') AS page_views,
COUNT(*) FILTER (WHERE event_type = 'purchase') AS purchases,
bool_or(event_type = 'purchase') AS converted,
MIN(page) AS landing_page,
MAX(exit_page) AS exit_page
FROM with_exit
GROUP BY user_id, synthetic_session_id
HAVING COUNT(*) > 1
ORDER BY duration_minutes DESC
LIMIT 20;