Data Validation and Anomaly Detection with CTEs
Data Validation and Anomaly Detection with CTEs
Before analysis, data must be trustworthy. CTEs are perfect for building multi-step validation pipelines that check for nulls, duplicates, referential integrity, outliers, and business rule violations — and produce clean audit reports.
The Validation Pipeline Pattern
Raw data
→ Check 1: Null/completeness checks
→ Check 2: Duplicate detection
→ Check 3: Referential integrity
→ Check 4: Range/business rule violations
→ Check 5: Statistical outliers
→ Summary report
Each check is a CTE. The final SELECT combines results.
Example 1: Comprehensive Data Audit (ShopMetrics)
WITH null_checks AS (
-- Check for missing required fields
SELECT
'ec_orders' AS table_name,
'null_customer_id' AS check_name,
COUNT(*) AS violation_count
FROM ec_orders WHERE customer_id IS NULL
UNION ALL
SELECT 'ec_order_items', 'null_product_id', COUNT(*)
FROM ec_order_items WHERE product_id IS NULL
UNION ALL
SELECT 'ec_order_items', 'zero_or_negative_price', COUNT(*)
FROM ec_order_items WHERE unit_price <= 0
UNION ALL
SELECT 'ec_order_items', 'zero_or_negative_quantity', COUNT(*)
FROM ec_order_items WHERE quantity <= 0
),
duplicate_checks AS (
SELECT
'ec_orders' AS table_name,
'duplicate_order_ids' AS check_name,
COUNT(*) - COUNT(DISTINCT order_id) AS violation_count
FROM ec_orders
UNION ALL
SELECT 'ec_customers', 'duplicate_emails', COUNT(*) - COUNT(DISTINCT email)
FROM ec_customers
),
referential_checks AS (
-- Order items without a matching order
SELECT
'ec_order_items' AS table_name,
'orphan_items_no_order' AS check_name,
COUNT(*) AS violation_count
FROM ec_order_items oi
WHERE NOT EXISTS (SELECT 1 FROM ec_orders o WHERE o.order_id = oi.order_id)
UNION ALL
-- Orders without a matching customer
SELECT 'ec_orders', 'orphan_orders_no_customer', COUNT(*)
FROM ec_orders o
WHERE NOT EXISTS (SELECT 1 FROM ec_customers c WHERE c.customer_id = o.customer_id)
),
all_checks AS (
SELECT * FROM null_checks
UNION ALL
SELECT * FROM duplicate_checks
UNION ALL
SELECT * FROM referential_checks
)
SELECT
table_name,
check_name,
violation_count,
CASE WHEN violation_count = 0 THEN '✓ PASS' ELSE '✗ FAIL' END AS status
FROM all_checks
ORDER BY violation_count DESC, table_name, check_name;
Example 2: Statistical Outlier Detection
Find ec_orders whose value is more than 3 standard deviations from the mean (potential data entry errors or fraud signals):
WITH order_totals AS (
SELECT
o.order_id,
o.customer_id,
o.created_at,
SUM(oi.quantity * oi.unit_price) AS total
FROM ec_orders o
JOIN ec_order_items oi ON oi.order_id = o.order_id
GROUP BY o.order_id, o.customer_id, o.created_at
),
stats AS (
SELECT
AVG(total) AS mean_total,
STDDEV(total) AS stddev_total
FROM order_totals
),
outliers AS (
SELECT
ot.order_id,
ot.customer_id,
ot.total,
s.mean_total,
s.stddev_total,
ABS(ot.total - s.mean_total) / NULLIF(s.stddev_total, 0) AS z_score
FROM order_totals ot
CROSS JOIN stats s
)
SELECT
order_id,
customer_id,
ROUND(total, 2) AS order_total,
ROUND(mean_total::NUMERIC, 2) AS mean,
ROUND(stddev_total::NUMERIC, 2) AS stddev,
ROUND(z_score, 2) AS z_score,
CASE
WHEN z_score > 3 THEN 'HIGH OUTLIER'
WHEN z_score > 2 THEN 'MODERATE OUTLIER'
ELSE 'NORMAL'
END AS flag
FROM outliers
WHERE z_score > 2
ORDER BY z_score DESC;
Example 3: IQR-Based Outlier Detection (Robust Method)
Z-score is sensitive to outliers themselves. IQR (Interquartile Range) is more robust:
WITH order_totals AS (
SELECT order_id, SUM(quantity * unit_price) AS total
FROM ec_order_items GROUP BY order_id
),
quartiles AS (
SELECT
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY total) AS q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY total) AS q3
FROM order_totals
),
bounds AS (
SELECT
q1,
q3,
q3 - q1 AS iqr,
q1 - 1.5 * (q3 - q1) AS lower_fence,
q3 + 1.5 * (q3 - q1) AS upper_fence
FROM quartiles
)
SELECT
ot.order_id,
ROUND(ot.total, 2) AS order_total,
ROUND(b.lower_fence::NUMERIC, 2) AS lower_fence,
ROUND(b.upper_fence::NUMERIC, 2) AS upper_fence,
CASE
WHEN ot.total < b.lower_fence THEN 'TOO LOW'
WHEN ot.total > b.upper_fence THEN 'TOO HIGH'
ELSE 'NORMAL'
END AS flag
FROM order_totals ot
CROSS JOIN bounds b
WHERE ot.total < b.lower_fence OR ot.total > b.upper_fence
ORDER BY ot.total DESC;
Example 4: Stock Price Validation (Finance/StockMarket)
Detect suspicious price movements (circuit breaker-style anomaly detection):
WITH daily_changes AS (
SELECT
company_id,
price_date,
close_price,
LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date) AS prev_close,
ROUND(
100.0 * (close_price - LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date))
/ NULLIF(LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date), 0),
2
) AS pct_change
FROM stock_prices
),
anomalies AS (
SELECT
dc.company_id,
c.ticker,
c.name,
dc.price_date,
dc.prev_close,
dc.close_price,
dc.pct_change,
CASE
WHEN ABS(dc.pct_change) > 20 THEN 'EXTREME (>20%)'
WHEN ABS(dc.pct_change) > 10 THEN 'HIGH (>10%)'
ELSE 'MODERATE (>5%)'
END AS severity
FROM daily_changes dc
JOIN companies c ON c.company_id = dc.company_id
WHERE ABS(dc.pct_change) > 5
AND dc.prev_close IS NOT NULL
)
SELECT * FROM anomalies
ORDER BY ABS(pct_change) DESC;