Cohort Analysis with CTEs

Cohort Analysis with CTEs

Cohort analysis groups users by when they first appeared (their "cohort") and tracks their behavior over time. It's the gold standard for measuring retention, engagement, and product health.

Why Cohort Analysis?

Aggregate metrics hide trends. If you see "30% of ec_customers made a second purchase," that hides whether:

  • New cohorts are performing better or worse than old ones
  • A product change improved retention for a specific cohort
  • Seasonal cohorts behave differently

Cohort analysis reveals these patterns.


The Three Core Questions

  1. Acquisition: How many users joined each cohort?
  2. Retention: Of those, what fraction returned in period N?
  3. Revenue: How much did each cohort generate over time?

Example 1: Monthly Retention Cohort (ShopMetrics)

WITH customer_cohorts AS (
    -- Assign each customer to a cohort based on first order month
    SELECT
        customer_id,
        DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
    FROM ec_orders
    GROUP BY customer_id
),

order_months AS (
    -- For each order, calculate months since cohort
    SELECT
        o.customer_id,
        cc.cohort_month,
        DATE_TRUNC('month', o.created_at)::DATE AS order_month,
        (EXTRACT(YEAR  FROM o.created_at) - EXTRACT(YEAR  FROM cc.cohort_month)) * 12 +
        (EXTRACT(MONTH FROM o.created_at) - EXTRACT(MONTH FROM cc.cohort_month)) AS months_since_cohort
    FROM ec_orders o
    JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
),

cohort_sizes AS (
    SELECT cohort_month, COUNT(DISTINCT customer_id) AS cohort_size
    FROM customer_cohorts
    GROUP BY cohort_month
),

retention AS (
    SELECT
        cohort_month,
        months_since_cohort,
        COUNT(DISTINCT customer_id) AS active_customers
    FROM order_months
    GROUP BY cohort_month, months_since_cohort
)

SELECT
    r.cohort_month,
    cs.cohort_size,
    r.months_since_cohort AS month_n,
    r.active_customers,
    ROUND(100.0 * r.active_customers / cs.cohort_size, 1) AS retention_pct
FROM retention r
JOIN cohort_sizes cs ON cs.cohort_month = r.cohort_month
ORDER BY r.cohort_month, r.months_since_cohort;

This produces a long-format table. You can pivot it to a retention matrix.


Example 2: Retention Matrix (Pivot)

Convert the long-format retention data into a classic cohort matrix:

WITH customer_cohorts AS (
    SELECT customer_id,
           DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
    FROM ec_orders GROUP BY customer_id
),

order_activity AS (
    SELECT
        o.customer_id,
        cc.cohort_month,
        (EXTRACT(YEAR  FROM o.created_at) - EXTRACT(YEAR  FROM cc.cohort_month)) * 12 +
        (EXTRACT(MONTH FROM o.created_at) - EXTRACT(MONTH FROM cc.cohort_month)) AS month_n
    FROM ec_orders o JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
),

cohort_sizes AS (
    SELECT cohort_month, COUNT(DISTINCT customer_id) AS size
    FROM customer_cohorts GROUP BY cohort_month
),

activity_agg AS (
    SELECT cohort_month, month_n, COUNT(DISTINCT customer_id) AS active
    FROM order_activity GROUP BY cohort_month, month_n
)

SELECT
    aa.cohort_month,
    cs.size AS cohort_size,
    ROUND(100.0 * MAX(CASE WHEN month_n = 0 THEN active END) / cs.size, 0) AS "M0",
    ROUND(100.0 * MAX(CASE WHEN month_n = 1 THEN active END) / cs.size, 0) AS "M1",
    ROUND(100.0 * MAX(CASE WHEN month_n = 2 THEN active END) / cs.size, 0) AS "M2",
    ROUND(100.0 * MAX(CASE WHEN month_n = 3 THEN active END) / cs.size, 0) AS "M3",
    ROUND(100.0 * MAX(CASE WHEN month_n = 6 THEN active END) / cs.size, 0) AS "M6"
FROM activity_agg aa
JOIN cohort_sizes cs ON cs.cohort_month = aa.cohort_month
GROUP BY aa.cohort_month, cs.size
ORDER BY aa.cohort_month;

Example 3: Revenue Cohort Analysis

Track cumulative revenue per cohort over time (LTV curves):

WITH customer_cohorts AS (
    SELECT customer_id,
           DATE_TRUNC('month', MIN(created_at))::DATE AS cohort_month
    FROM ec_orders GROUP BY customer_id
),

monthly_revenue AS (
    SELECT
        o.customer_id,
        cc.cohort_month,
        DATE_TRUNC('month', o.created_at)::DATE AS revenue_month,
        SUM(oi.quantity * oi.unit_price) AS revenue
    FROM ec_orders o
    JOIN ec_order_items oi ON oi.order_id = o.order_id
    JOIN customer_cohorts cc ON cc.customer_id = o.customer_id
    GROUP BY o.customer_id, cc.cohort_month, DATE_TRUNC('month', o.created_at)
),

cohort_revenue AS (
    SELECT
        cohort_month,
        (EXTRACT(YEAR  FROM revenue_month) - EXTRACT(YEAR  FROM cohort_month)) * 12 +
        (EXTRACT(MONTH FROM revenue_month) - EXTRACT(MONTH FROM cohort_month)) AS month_n,
        SUM(revenue) AS period_revenue
    FROM monthly_revenue
    GROUP BY cohort_month, month_n
),

with_cumulative AS (
    SELECT
        cohort_month,
        month_n,
        period_revenue,
        SUM(period_revenue) OVER (
            PARTITION BY cohort_month
            ORDER BY month_n
            ROWS UNBOUNDED PRECEDING
        ) AS cumulative_revenue
    FROM cohort_revenue
),

cohort_sizes AS (
    SELECT cohort_month, COUNT(*) AS size FROM customer_cohorts GROUP BY cohort_month
)

SELECT
    wc.cohort_month,
    cs.size,
    wc.month_n,
    ROUND(wc.cumulative_revenue, 2)            AS cumulative_revenue,
    ROUND(wc.cumulative_revenue / cs.size, 2)  AS ltv_per_customer
FROM with_cumulative wc
JOIN cohort_sizes cs ON cs.cohort_month = wc.cohort_month
ORDER BY wc.cohort_month, wc.month_n;

Example 4: Event-Based Cohorts (Stock Market Dataset)

Group companies by the year they first appeared in the stock prices dataset and compare their trading activity over time:

WITH company_cohorts AS (
    SELECT
        company_id,
        DATE_TRUNC('year', MIN(price_date))::DATE AS cohort_year
    FROM stock_prices
    GROUP BY company_id
),

annual_activity AS (
    SELECT
        sp.company_id,
        cc.cohort_year,
        DATE_TRUNC('year', sp.price_date)::DATE AS trading_year,
        COUNT(*)                                 AS trading_days,
        ROUND(AVG(sp.close_price), 2)            AS avg_close_price,
        ROUND(AVG(sp.volume), 0)                 AS avg_daily_volume
    FROM stock_prices sp
    JOIN company_cohorts cc ON cc.company_id = sp.company_id
    GROUP BY sp.company_id, cc.cohort_year, DATE_TRUNC('year', sp.price_date)
)

SELECT
    cohort_year,
    trading_year,
    COUNT(DISTINCT company_id)        AS active_companies,
    ROUND(AVG(trading_days), 0)       AS avg_trading_days,
    ROUND(AVG(avg_close_price), 2)    AS avg_close_price
FROM annual_activity
GROUP BY cohort_year, trading_year
ORDER BY cohort_year, trading_year;

Purchase this course to unlock the full lesson.

Sign up