Bot Detection — Behavioural Signals in SQL

Bot Detection — Behavioural Signals in SQL

Bots have distinctive behavioural patterns: posting at perfectly regular intervals, unusually high tweet volume, zero engagement despite many followers, and posting only via API (not a real client). We score each user on multiple signals.

-- Bot signal scoring — pre-compute inter-tweet intervals via window
-- function in an inner CTE, then aggregate per user.
WITH tweet_intervals AS (
    SELECT
        user_id,
        source,
        created_at,
        EXTRACT(EPOCH FROM (
            created_at - LAG(created_at) OVER (
                PARTITION BY user_id ORDER BY created_at
            )
        )) AS gap_secs
    FROM tw_tweets
    WHERE created_at >= NOW() - INTERVAL '30 days'
),
tweet_timing AS (
    SELECT
        user_id,
        COUNT(*)                                        AS tweet_count_30d,
        AVG(gap_secs)                                   AS avg_interval_secs,
        STDDEV_SAMP(gap_secs)                           AS std_interval_secs,
        COUNT(*) FILTER (WHERE source = 'API')          AS api_tweets,
        COUNT(*) FILTER (WHERE EXTRACT(HOUR FROM created_at) BETWEEN 2 AND 5)
                                                        AS late_night_tweets
    FROM tweet_intervals
    GROUP BY user_id
    HAVING COUNT(*) >= 5
)
SELECT
    u.username,
    u.is_bot                                            AS known_bot,
    u.follower_count,
    u.verified,
    tt.tweet_count_30d,
    ROUND(tt.avg_interval_secs / 60, 1)                AS avg_minutes_between_tweets,
    ROUND(tt.std_interval_secs / 60, 1)                AS std_dev_minutes,
    -- Low coefficient of variation = suspiciously regular posting
    ROUND(tt.std_interval_secs / NULLIF(tt.avg_interval_secs, 0), 3)
                                                        AS interval_cv,
    ROUND(100.0 * tt.api_tweets / NULLIF(tt.tweet_count_30d, 0), 1)
                                                        AS api_tweet_pct,
    tt.late_night_tweets,
    -- Bot score (0-100)
    LEAST(100,
        -- High tweet volume
        CASE WHEN tt.tweet_count_30d > 300 THEN 30
             WHEN tt.tweet_count_30d > 100 THEN 15
             ELSE 0 END
        -- Suspiciously regular intervals (CV < 0.3)
        + CASE WHEN tt.std_interval_secs / NULLIF(tt.avg_interval_secs,0) < 0.1 THEN 30
               WHEN tt.std_interval_secs / NULLIF(tt.avg_interval_secs,0) < 0.3 THEN 15
               ELSE 0 END
        -- API-heavy posting
        + CASE WHEN tt.api_tweets::float / NULLIF(tt.tweet_count_30d,0) > 0.9 THEN 25
               WHEN tt.api_tweets::float / NULLIF(tt.tweet_count_30d,0) > 0.5 THEN 10
               ELSE 0 END
        -- High late-night activity
        + CASE WHEN tt.late_night_tweets > tt.tweet_count_30d * 0.3 THEN 15
               ELSE 0 END
    )                                                   AS bot_score
FROM tweet_timing tt
JOIN tw_users u ON u.id = tt.user_id
ORDER BY bot_score DESC
LIMIT 25;

Purchase this course to unlock the full lesson.

Sign up