pandas Integration — SQL Results into DataFrames & Back
pandas Integration — SQL Results into DataFrames & Back
pandas is the most common destination for SQL query results in data analysis.
pd.read_sql turns any query into a DataFrame; DataFrame.to_sql writes results back.
import pandas as pd
import psycopg2
from sqlalchemy import create_engine
# ── Option 1: pd.read_sql with a psycopg2 connection ─────────────
conn = psycopg2.connect(host="127.0.0.1", dbname="shared_db",
user="your_db_user", password="your_db_password")
df = pd.read_sql("""
SELECT
u.username,
u.verified,
COUNT(t.id) AS tweet_count,
SUM(t.like_count) AS total_likes,
AVG(t.like_count) AS avg_likes
FROM tw_users u
JOIN tw_tweets t ON t.user_id = u.id
WHERE t.created_at >= NOW() - INTERVAL '30 days'
GROUP BY u.username, u.verified
HAVING COUNT(t.id) >= 5
ORDER BY total_likes DESC
LIMIT 50
""", conn)
print(df.dtypes)
print(df.head(10))
# ── Option 2: SQLAlchemy engine (required for to_sql) ─────────────
engine = create_engine(
"postgresql+psycopg2://your_db_user:your_db_password"
"@127.0.0.1/shared_db"
)
# Parameterised query with SQLAlchemy
from sqlalchemy import text
with engine.connect() as conn:
df = pd.read_sql(
text("SELECT * FROM tw_tweets WHERE user_id = :uid LIMIT 100"),
conn,
params={"uid": 42}
)
# ── Chunked reading for large tables ──────────────────────────────
chunk_iter = pd.read_sql(
"SELECT * FROM tw_tweets ORDER BY created_at",
engine,
chunksize=1000 # yields DataFrames of 1000 rows
)
totals = []
for chunk in chunk_iter:
chunk_agg = chunk.groupby("user_id")["like_count"].sum()
totals.append(chunk_agg)
result = pd.concat(totals).groupby(level=0).sum().sort_values(ascending=False)
print(result.head(10))