Data Validation and Anomaly Detection with CTEs

Data Validation and Anomaly Detection with CTEs

Before analysis, data must be trustworthy. CTEs are perfect for building multi-step validation pipelines that check for nulls, duplicates, referential integrity, outliers, and business rule violations — and produce clean audit reports.

The Validation Pipeline Pattern

Raw data
   → Check 1: Null/completeness checks
   → Check 2: Duplicate detection
   → Check 3: Referential integrity
   → Check 4: Range/business rule violations
   → Check 5: Statistical outliers
   → Summary report

Each check is a CTE. The final SELECT combines results.


Example 1: Comprehensive Data Audit (ShopMetrics)

WITH null_checks AS (
    -- Check for missing required fields
    SELECT
        'ec_orders' AS table_name,
        'null_customer_id' AS check_name,
        COUNT(*) AS violation_count
    FROM ec_orders WHERE customer_id IS NULL

    UNION ALL

    SELECT 'ec_order_items', 'null_product_id', COUNT(*)
    FROM ec_order_items WHERE product_id IS NULL

    UNION ALL

    SELECT 'ec_order_items', 'zero_or_negative_price', COUNT(*)
    FROM ec_order_items WHERE unit_price <= 0

    UNION ALL

    SELECT 'ec_order_items', 'zero_or_negative_quantity', COUNT(*)
    FROM ec_order_items WHERE quantity <= 0
),

duplicate_checks AS (
    SELECT
        'ec_orders' AS table_name,
        'duplicate_order_ids' AS check_name,
        COUNT(*) - COUNT(DISTINCT order_id) AS violation_count
    FROM ec_orders

    UNION ALL

    SELECT 'ec_customers', 'duplicate_emails', COUNT(*) - COUNT(DISTINCT email)
    FROM ec_customers
),

referential_checks AS (
    -- Order items without a matching order
    SELECT
        'ec_order_items' AS table_name,
        'orphan_items_no_order' AS check_name,
        COUNT(*) AS violation_count
    FROM ec_order_items oi
    WHERE NOT EXISTS (SELECT 1 FROM ec_orders o WHERE o.order_id = oi.order_id)

    UNION ALL

    -- Orders without a matching customer
    SELECT 'ec_orders', 'orphan_orders_no_customer', COUNT(*)
    FROM ec_orders o
    WHERE NOT EXISTS (SELECT 1 FROM ec_customers c WHERE c.customer_id = o.customer_id)
),

all_checks AS (
    SELECT * FROM null_checks
    UNION ALL
    SELECT * FROM duplicate_checks
    UNION ALL
    SELECT * FROM referential_checks
)

SELECT
    table_name,
    check_name,
    violation_count,
    CASE WHEN violation_count = 0 THEN '✓ PASS' ELSE '✗ FAIL' END AS status
FROM all_checks
ORDER BY violation_count DESC, table_name, check_name;

Example 2: Statistical Outlier Detection

Find ec_orders whose value is more than 3 standard deviations from the mean (potential data entry errors or fraud signals):

WITH order_totals AS (
    SELECT
        o.order_id,
        o.customer_id,
        o.created_at,
        SUM(oi.quantity * oi.unit_price) AS total
    FROM ec_orders o
    JOIN ec_order_items oi ON oi.order_id = o.order_id
    GROUP BY o.order_id, o.customer_id, o.created_at
),

stats AS (
    SELECT
        AVG(total)    AS mean_total,
        STDDEV(total) AS stddev_total
    FROM order_totals
),

outliers AS (
    SELECT
        ot.order_id,
        ot.customer_id,
        ot.total,
        s.mean_total,
        s.stddev_total,
        ABS(ot.total - s.mean_total) / NULLIF(s.stddev_total, 0) AS z_score
    FROM order_totals ot
    CROSS JOIN stats s
)

SELECT
    order_id,
    customer_id,
    ROUND(total, 2)         AS order_total,
    ROUND(mean_total::NUMERIC, 2)    AS mean,
    ROUND(stddev_total::NUMERIC, 2)  AS stddev,
    ROUND(z_score, 2)       AS z_score,
    CASE
        WHEN z_score > 3 THEN 'HIGH OUTLIER'
        WHEN z_score > 2 THEN 'MODERATE OUTLIER'
        ELSE 'NORMAL'
    END AS flag
FROM outliers
WHERE z_score > 2
ORDER BY z_score DESC;

Example 3: IQR-Based Outlier Detection (Robust Method)

Z-score is sensitive to outliers themselves. IQR (Interquartile Range) is more robust:

WITH order_totals AS (
    SELECT order_id, SUM(quantity * unit_price) AS total
    FROM ec_order_items GROUP BY order_id
),

quartiles AS (
    SELECT
        PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY total) AS q1,
        PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY total) AS q3
    FROM order_totals
),

bounds AS (
    SELECT
        q1,
        q3,
        q3 - q1              AS iqr,
        q1 - 1.5 * (q3 - q1) AS lower_fence,
        q3 + 1.5 * (q3 - q1) AS upper_fence
    FROM quartiles
)

SELECT
    ot.order_id,
    ROUND(ot.total, 2)                  AS order_total,
    ROUND(b.lower_fence::NUMERIC, 2)    AS lower_fence,
    ROUND(b.upper_fence::NUMERIC, 2)    AS upper_fence,
    CASE
        WHEN ot.total < b.lower_fence THEN 'TOO LOW'
        WHEN ot.total > b.upper_fence THEN 'TOO HIGH'
        ELSE 'NORMAL'
    END AS flag
FROM order_totals ot
CROSS JOIN bounds b
WHERE ot.total < b.lower_fence OR ot.total > b.upper_fence
ORDER BY ot.total DESC;

Example 4: Stock Price Validation (Finance/StockMarket)

Detect suspicious price movements (circuit breaker-style anomaly detection):

WITH daily_changes AS (
    SELECT
        company_id,
        price_date,
        close_price,
        LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date) AS prev_close,
        ROUND(
            100.0 * (close_price - LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date))
            / NULLIF(LAG(close_price) OVER (PARTITION BY company_id ORDER BY price_date), 0),
            2
        ) AS pct_change
    FROM stock_prices
),

anomalies AS (
    SELECT
        dc.company_id,
        c.ticker,
        c.name,
        dc.price_date,
        dc.prev_close,
        dc.close_price,
        dc.pct_change,
        CASE
            WHEN ABS(dc.pct_change) > 20 THEN 'EXTREME (>20%)'
            WHEN ABS(dc.pct_change) > 10 THEN 'HIGH (>10%)'
            ELSE 'MODERATE (>5%)'
        END AS severity
    FROM daily_changes dc
    JOIN companies c ON c.company_id = dc.company_id
    WHERE ABS(dc.pct_change) > 5
      AND dc.prev_close IS NOT NULL
)

SELECT * FROM anomalies
ORDER BY ABS(pct_change) DESC;

Purchase this course to unlock the full lesson.

Sign up