Batch Operations — executemany vs execute_values vs COPY
Batch Operations — executemany vs execute_values vs COPY
Inserting thousands of rows one at a time is slow. Three escalating approaches:
executemany (simple), execute_values (fast), and COPY FROM STDIN (fastest).
import psycopg2
from psycopg2.extras import execute_values
import io, csv, time
conn = psycopg2.connect(host="127.0.0.1", dbname="shared_db",
user="your_db_user", password="your_db_password")
cur = conn.cursor()
# Simulate 10,000 rows to insert
import random, string
def fake_rows(n):
return [
(
"bulk_" + "".join(random.choices(string.ascii_lowercase, k=8)),
f"Bulk User {i}",
random.choice(["New York","London","Tokyo","Remote"]),
random.randint(0, 50000),
)
for i in range(n)
]
rows = fake_rows(10_000)
# ── Method 1: executemany — one round-trip per row ────────────────
start = time.time()
cur.executemany("""
INSERT INTO tw_users
(username, display_name, location, follower_count)
VALUES (%s, %s, %s, %s)
ON CONFLICT DO NOTHING
""", rows)
conn.commit()
print(f"executemany: {time.time() - start:.2f}s")
# ── Method 2: execute_values — batches into multi-row VALUES ──────
rows = fake_rows(10_000)
start = time.time()
execute_values(cur, """
INSERT INTO tw_users
(username, display_name, location, follower_count)
VALUES %s
ON CONFLICT DO NOTHING
""", rows, page_size=500) # 500 rows per INSERT statement
conn.commit()
print(f"execute_values: {time.time() - start:.2f}s")
# ── Method 3: COPY FROM STDIN — fastest possible ──────────────────
rows = fake_rows(10_000)
start = time.time()
# Build an in-memory CSV buffer
buf = io.StringIO()
writer = csv.writer(buf)
for row in rows:
writer.writerow(row)
buf.seek(0)
cur.copy_expert("""
COPY tw_users
(username, display_name, location, follower_count)
FROM STDIN WITH (FORMAT csv)
""", buf)
conn.commit()
print(f"COPY FROM STDIN: {time.time() - start:.2f}s")