nordabiz/scripts/fix_waterm_duplicates.py

38 lines
1.4 KiB
Python

"""One-time fix: remove duplicate company_websites for Waterm (company_id=12)"""
import os
from sqlalchemy import create_engine, text
engine = create_engine(os.environ['DATABASE_URL'])
with engine.connect() as conn:
rows = conn.execute(text(
"SELECT id, url, website_type, source, is_primary FROM company_websites "
"WHERE company_id = 12 ORDER BY id"
)).fetchall()
print("Before cleanup:")
for r in rows:
print(f" id={r[0]} url={r[1]} type={r[2]} source={r[3]} primary={r[4]}")
# Keep only the latest record per URL (prefer manual_edit)
keep = conn.execute(text(
"SELECT DISTINCT ON (url) id FROM company_websites "
"WHERE company_id = 12 "
"ORDER BY url, CASE WHEN source = 'manual_edit' THEN 0 ELSE 1 END, id DESC"
)).fetchall()
keep_ids = [r[0] for r in keep]
print(f"Keeping ids: {keep_ids}")
if keep_ids:
placeholders = ",".join(str(i) for i in keep_ids)
conn.execute(text(
f"DELETE FROM company_websites WHERE company_id = 12 AND id NOT IN ({placeholders})"
))
conn.commit()
rows2 = conn.execute(text(
"SELECT id, url, website_type, source, is_primary FROM company_websites "
"WHERE company_id = 12 ORDER BY id"
)).fetchall()
print("After cleanup:")
for r in rows2:
print(f" id={r[0]} url={r[1]} type={r[2]} source={r[3]} primary={r[4]}")