Cohort Analysis with CTEs
Cohort Analysis with CTEs
Cohort analysis groups users by when they first appeared (their "cohort") and tracks their behavior over time. It's the gold standard for measuring retention, engagement, and product health.
Why Cohort Analysis?
Aggregate metrics hide trends. If you see "30% of ec_customers made a second purchase," that hides whether:
- New cohorts are performing better or worse than old ones
- A product change improved retention for a specific cohort
- Seasonal cohorts behave differently
Cohort analysis reveals these patterns.
The Three Core Questions
- Acquisition: How many users joined each cohort?
- Retention: Of those, what fraction returned in period N?
- Revenue: How much did each cohort generate over time?
Example 1: Monthly Retention Cohort (ShopMetrics)
WITH customer_cohorts AS (
-- Assign each customer to a cohort based on first order month
SELECT
customer_id,
DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
FROM ec_orders
GROUP BY customer_id
),
order_months AS (
-- For each order, calculate months since cohort
SELECT
o.customer_id,
cc.cohort_month,
DATE_TRUNC('month', o.created_at)::DATE AS order_month,
(EXTRACT(YEAR FROM o.created_at) - EXTRACT(YEAR FROM cc.cohort_month)) * 12 +
(EXTRACT(MONTH FROM o.created_at) - EXTRACT(MONTH FROM cc.cohort_month)) AS months_since_cohort
FROM ec_orders o
JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
),
cohort_sizes AS (
SELECT cohort_month, COUNT(DISTINCT customer_id) AS cohort_size
FROM customer_cohorts
GROUP BY cohort_month
),
retention AS (
SELECT
cohort_month,
months_since_cohort,
COUNT(DISTINCT customer_id) AS active_customers
FROM order_months
GROUP BY cohort_month, months_since_cohort
)
SELECT
r.cohort_month,
cs.cohort_size,
r.months_since_cohort AS month_n,
r.active_customers,
ROUND(100.0 * r.active_customers / cs.cohort_size, 1) AS retention_pct
FROM retention r
JOIN cohort_sizes cs ON cs.cohort_month = r.cohort_month
ORDER BY r.cohort_month, r.months_since_cohort;
This produces a long-format table. You can pivot it to a retention matrix.
Example 2: Retention Matrix (Pivot)
Convert the long-format retention data into a classic cohort matrix:
WITH customer_cohorts AS (
SELECT customer_id,
DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
FROM ec_orders GROUP BY customer_id
),
order_activity AS (
SELECT
o.customer_id,
cc.cohort_month,
(EXTRACT(YEAR FROM o.created_at) - EXTRACT(YEAR FROM cc.cohort_month)) * 12 +
(EXTRACT(MONTH FROM o.created_at) - EXTRACT(MONTH FROM cc.cohort_month)) AS month_n
FROM ec_orders o JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
),
cohort_sizes AS (
SELECT cohort_month, COUNT(DISTINCT customer_id) AS size
FROM customer_cohorts GROUP BY cohort_month
),
activity_agg AS (
SELECT cohort_month, month_n, COUNT(DISTINCT customer_id) AS active
FROM order_activity GROUP BY cohort_month, month_n
)
SELECT
aa.cohort_month,
cs.size AS cohort_size,
ROUND(100.0 * MAX(CASE WHEN month_n = 0 THEN active END) / cs.size, 0) AS "M0",
ROUND(100.0 * MAX(CASE WHEN month_n = 1 THEN active END) / cs.size, 0) AS "M1",
ROUND(100.0 * MAX(CASE WHEN month_n = 2 THEN active END) / cs.size, 0) AS "M2",
ROUND(100.0 * MAX(CASE WHEN month_n = 3 THEN active END) / cs.size, 0) AS "M3",
ROUND(100.0 * MAX(CASE WHEN month_n = 6 THEN active END) / cs.size, 0) AS "M6"
FROM activity_agg aa
JOIN cohort_sizes cs ON cs.cohort_month = aa.cohort_month
GROUP BY aa.cohort_month, cs.size
ORDER BY aa.cohort_month;
Example 3: Revenue Cohort Analysis
Track cumulative revenue per cohort over time (LTV curves):
WITH customer_cohorts AS (
SELECT customer_id,
DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
FROM ec_orders GROUP BY customer_id
),
monthly_revenue AS (
SELECT
o.customer_id,
cc.cohort_month,
DATE_TRUNC('month', o.created_at)::DATE AS revenue_month,
SUM(oi.quantity * oi.unit_price) AS revenue
FROM ec_orders o
JOIN ec_order_items oi ON oi.order_id = o.order_id
JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
GROUP BY o.customer_id, cc.cohort_month, DATE_TRUNC('month', o.created_at)
),
cohort_revenue AS (
SELECT
cohort_month,
(EXTRACT(YEAR FROM revenue_month) - EXTRACT(YEAR FROM cohort_month)) * 12 +
(EXTRACT(MONTH FROM revenue_month) - EXTRACT(MONTH FROM cohort_month)) AS month_n,
SUM(revenue) AS period_revenue
FROM monthly_revenue
GROUP BY cohort_month, month_n
),
with_cumulative AS (
SELECT
cohort_month,
month_n,
period_revenue,
SUM(period_revenue) OVER (
PARTITION BY cohort_month
ORDER BY month_n
ROWS UNBOUNDED PRECEDING
) AS cumulative_revenue
FROM cohort_revenue
),
cohort_sizes AS (
SELECT cohort_month, COUNT(*) AS size FROM customer_cohorts GROUP BY cohort_month
)
SELECT
wc.cohort_month,
cs.size,
wc.month_n,
ROUND(wc.cumulative_revenue, 2) AS cumulative_revenue,
ROUND(wc.cumulative_revenue / cs.size, 2) AS ltv_per_customer
FROM with_cumulative wc
JOIN cohort_sizes cs ON cs.cohort_month = wc.cohort_month
ORDER BY wc.cohort_month, wc.month_n;
Example 4: Event-Based Cohorts (Stock Market Dataset)
Group companies by the year they first appeared in the stock prices dataset and compare their trading activity over time:
WITH company_cohorts AS (
SELECT
company_id,
DATE_TRUNC('year', MIN(price_date))::DATE AS cohort_year
FROM stock_prices
GROUP BY company_id
),
annual_activity AS (
SELECT
sp.company_id,
cc.cohort_year,
DATE_TRUNC('year', sp.price_date)::DATE AS trading_year,
COUNT(*) AS trading_days,
ROUND(AVG(sp.close_price), 2) AS avg_close_price,
ROUND(AVG(sp.volume), 0) AS avg_daily_volume
FROM stock_prices sp
JOIN company_cohorts cc ON cc.company_id = sp.company_id
GROUP BY sp.company_id, cc.cohort_year, DATE_TRUNC('year', sp.price_date)
)
SELECT
cohort_year,
trading_year,
COUNT(DISTINCT company_id) AS active_companies,
ROUND(AVG(trading_days), 0) AS avg_trading_days,
ROUND(AVG(avg_close_price), 2) AS avg_close_price
FROM annual_activity
GROUP BY cohort_year, trading_year
ORDER BY cohort_year, trading_year;