Bot Detection — Behavioural Signals in SQL
Bot Detection — Behavioural Signals in SQL
Bots have distinctive behavioural patterns: posting at perfectly regular intervals, unusually high tweet volume, zero engagement despite many followers, and posting only via API (not a real client). We score each user on multiple signals.
-- Bot signal scoring — pre-compute inter-tweet intervals via window
-- function in an inner CTE, then aggregate per user.
WITH tweet_intervals AS (
SELECT
user_id,
source,
created_at,
EXTRACT(EPOCH FROM (
created_at - LAG(created_at) OVER (
PARTITION BY user_id ORDER BY created_at
)
)) AS gap_secs
FROM tw_tweets
WHERE created_at >= NOW() - INTERVAL '30 days'
),
tweet_timing AS (
SELECT
user_id,
COUNT(*) AS tweet_count_30d,
AVG(gap_secs) AS avg_interval_secs,
STDDEV_SAMP(gap_secs) AS std_interval_secs,
COUNT(*) FILTER (WHERE source = 'API') AS api_tweets,
COUNT(*) FILTER (WHERE EXTRACT(HOUR FROM created_at) BETWEEN 2 AND 5)
AS late_night_tweets
FROM tweet_intervals
GROUP BY user_id
HAVING COUNT(*) >= 5
)
SELECT
u.username,
u.is_bot AS known_bot,
u.follower_count,
u.verified,
tt.tweet_count_30d,
ROUND(tt.avg_interval_secs / 60, 1) AS avg_minutes_between_tweets,
ROUND(tt.std_interval_secs / 60, 1) AS std_dev_minutes,
-- Low coefficient of variation = suspiciously regular posting
ROUND(tt.std_interval_secs / NULLIF(tt.avg_interval_secs, 0), 3)
AS interval_cv,
ROUND(100.0 * tt.api_tweets / NULLIF(tt.tweet_count_30d, 0), 1)
AS api_tweet_pct,
tt.late_night_tweets,
-- Bot score (0-100)
LEAST(100,
-- High tweet volume
CASE WHEN tt.tweet_count_30d > 300 THEN 30
WHEN tt.tweet_count_30d > 100 THEN 15
ELSE 0 END
-- Suspiciously regular intervals (CV < 0.3)
+ CASE WHEN tt.std_interval_secs / NULLIF(tt.avg_interval_secs,0) < 0.1 THEN 30
WHEN tt.std_interval_secs / NULLIF(tt.avg_interval_secs,0) < 0.3 THEN 15
ELSE 0 END
-- API-heavy posting
+ CASE WHEN tt.api_tweets::float / NULLIF(tt.tweet_count_30d,0) > 0.9 THEN 25
WHEN tt.api_tweets::float / NULLIF(tt.tweet_count_30d,0) > 0.5 THEN 10
ELSE 0 END
-- High late-night activity
+ CASE WHEN tt.late_night_tweets > tt.tweet_count_30d * 0.3 THEN 15
ELSE 0 END
) AS bot_score
FROM tweet_timing tt
JOIN tw_users u ON u.id = tt.user_id
ORDER BY bot_score DESC
LIMIT 25;