Step-by-Step Aggregation with CTEs
Step-by-Step Aggregation with CTEs
Complex aggregation problems often feel impossible to solve in a single query. CTEs let you break them into clear, verifiable steps — each one building on the last.
The Problem with Nested Aggregations
Suppose you want to find ec_customers whose average order value is above the overall average order value. You need two aggregation levels:
- Average order value per customer
- Average of those averages (or the global average)
In raw SQL this becomes a mess of subqueries. With CTEs, each step is explicit.
Example 1: Customers Above the Average Order Value
WITH order_totals AS (
-- Step 1: total per order
SELECT
o.order_id,
o.customer_id,
SUM(oi.quantity * oi.unit_price) AS order_total
FROM ec_orders o
JOIN ec_order_items oi ON oi.order_id = o.order_id
GROUP BY o.order_id, o.customer_id
),
customer_avg AS (
-- Step 2: average order value per customer
SELECT
customer_id,
AVG(order_total) AS avg_order_value,
COUNT(*) AS total_orders,
SUM(order_total) AS lifetime_value
FROM order_totals
GROUP BY customer_id
),
global_avg AS (
-- Step 3: global average across all ec_orders
SELECT AVG(order_total) AS global_avg_order
FROM order_totals
)
-- Step 4: filter ec_customers above the global average
SELECT
c.name AS customer,
ca.avg_order_value,
ca.total_orders,
ca.lifetime_value,
ROUND(ca.avg_order_value - ga.global_avg_order, 2) AS above_avg_by
FROM customer_avg ca
JOIN ec_customers c ON c.customer_id = ca.customer_id
CROSS JOIN global_avg ga
WHERE ca.avg_order_value > ga.global_avg_order
ORDER BY ca.avg_order_value DESC;
Key technique: CROSS JOIN global_avg — because global_avg returns a single row, this effectively attaches the global figure to every customer row without changing the row count.
Example 2: Product Category Performance vs Category Average
Which ec_products are outperforming their own category's average revenue?
WITH product_revenue AS (
-- revenue per product
SELECT
p.product_id,
p.name,
p.category,
SUM(oi.quantity * oi.unit_price) AS revenue
FROM ec_products p
JOIN ec_order_items oi ON oi.product_id = p.product_id
GROUP BY p.product_id, p.name, p.category
),
category_avg AS (
-- average revenue within each category
SELECT
category,
AVG(revenue) AS avg_category_revenue,
COUNT(*) AS products_in_category
FROM product_revenue
GROUP BY category
)
SELECT
pr.name,
pr.category,
pr.revenue,
ROUND(ca.avg_category_revenue, 2) AS category_avg,
ROUND(pr.revenue - ca.avg_category_revenue, 2) AS difference,
ROUND(100.0 * (pr.revenue - ca.avg_category_revenue) / ca.avg_category_revenue, 1) AS pct_above_avg
FROM product_revenue pr
JOIN category_avg ca ON ca.category = pr.category
WHERE pr.revenue > ca.avg_category_revenue
ORDER BY pct_above_avg DESC;
Example 3: Multi-Level Sales Funnel from Events
Using the ec_events table — how many users hit each stage, and what's the drop-off at each step?
WITH stage_users AS (
-- count distinct users per event type
SELECT
event_type,
COUNT(DISTINCT customer_id) AS users
FROM ec_events
GROUP BY event_type
),
funnel AS (
-- define the funnel order explicitly
SELECT
event_type,
users,
CASE event_type
WHEN 'page_view' THEN 1
WHEN 'add_to_cart' THEN 2
WHEN 'purchase' THEN 3
END AS funnel_step
FROM stage_users
WHERE event_type IN ('page_view', 'add_to_cart', 'purchase')
),
with_lag AS (
SELECT
funnel_step,
event_type,
users,
LAG(users) OVER (ORDER BY funnel_step) AS prev_users
FROM funnel
)
SELECT
funnel_step,
event_type,
users,
prev_users,
CASE
WHEN prev_users IS NULL THEN NULL
ELSE ROUND(100.0 * users / prev_users, 1)
END AS retention_pct
FROM with_lag
ORDER BY funnel_step;
This combines CTEs with a window function (LAG) for clean funnel analysis.
Example 4: Stock Portfolio Performance (StockMarket dataset)
Find which stocks in each sector are beating the sector's average 30-day return.
WITH recent_prices AS (
-- latest price per stock
SELECT DISTINCT ON (company_id)
company_id,
close_price AS latest_price,
price_date AS latest_date
FROM stock_prices
ORDER BY company_id, price_date DESC
),
prices_30d_ago AS (
-- price closest to 30 days ago
SELECT DISTINCT ON (company_id)
company_id,
close_price AS price_30d,
price_date
FROM stock_prices
WHERE price_date <= CURRENT_DATE - INTERVAL '30 days'
ORDER BY company_id, price_date DESC
),
stock_returns AS (
SELECT
c.company_id,
c.ticker,
c.name,
c.sector,
ROUND(100.0 * (rp.latest_price - p30.price_30d) / p30.price_30d, 2) AS return_30d_pct
FROM companies c
JOIN recent_prices rp ON rp.company_id = c.company_id
JOIN prices_30d_ago p30 ON p30.company_id = c.company_id
),
sector_avg AS (
SELECT
sector,
AVG(return_30d_pct) AS avg_sector_return
FROM stock_returns
GROUP BY sector
)
SELECT
sr.ticker,
sr.name,
sr.sector,
sr.return_30d_pct,
ROUND(sa.avg_sector_return, 2) AS sector_avg,
ROUND(sr.return_30d_pct - sa.avg_sector_return, 2) AS alpha
FROM stock_returns sr
JOIN sector_avg sa ON sa.sector = sr.sector
WHERE sr.return_30d_pct > sa.avg_sector_return
ORDER BY alpha DESC;