nordabiz/scripts/backfill_zopk_images.py
Maciej Pienczyn 172f2085db
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
fix: add local image caching for ZOPK news thumbnails
Source servers return 503 (Cloudflare) for cross-origin image requests
from browsers. Solution: download and cache images server-side during
scraping, serve from /static/uploads/zopk/.

- Scraper now downloads og:image and stores locally during article
  scraping (max 2MB, supports jpg/png/webp)
- Backfill script downloads images for all existing articles server-side
- Template fallback shows domain initial letter when image unavailable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 09:08:03 +01:00

182 lines
6.0 KiB
Python

#!/usr/bin/env python3
"""
Backfill ZOPK news images — download and cache locally.
Downloads images from original source URLs and saves them to
static/uploads/zopk/ so they can be served without cross-origin issues.
Usage:
python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N]
"""
import sys
import os
import time
import logging
import argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv()
from database import SessionLocal, ZOPKNews
from sqlalchemy import or_
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Reuse scraper's session and image download logic
import requests
USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'
CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'static', 'uploads', 'zopk')
def download_image(image_url, news_id, session):
    """Download an image and cache it locally under CACHE_DIR.

    Args:
        image_url: Absolute URL of the image to fetch.
        news_id: Primary key of the ZOPKNews row; used as the filename stem
            so re-running the backfill overwrites rather than duplicates.
        session: requests.Session preconfigured with the bot User-Agent.

    Returns:
        Web path ('/static/uploads/zopk/<news_id>.<ext>') on success, or
        None on any failure (HTTP error, non-image content, oversized or
        suspiciously tiny payload, network exception).
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        resp = session.get(image_url, timeout=10, stream=True)
        if resp.status_code != 200:
            logger.debug(f" HTTP {resp.status_code}: {image_url[:80]}")
            return None
        content_type = resp.headers.get('Content-Type', '')
        # Accept when either the Content-Type claims an image or the URL has
        # a known image extension (some servers send wrong/missing types).
        if 'image' not in content_type and not any(
            image_url.lower().endswith(e) for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
        ):
            logger.debug(f" Not an image ({content_type}): {image_url[:80]}")
            return None
        # Choose a file extension from the URL or Content-Type; default .jpg.
        ext = '.jpg'
        if '.png' in image_url.lower() or 'png' in content_type:
            ext = '.png'
        elif '.webp' in image_url.lower() or 'webp' in content_type:
            ext = '.webp'
        filename = f'{news_id}{ext}'
        filepath = os.path.join(CACHE_DIR, filename)
        max_size = 2 * 1024 * 1024  # 2 MB hard cap per image
        size = 0
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                size += len(chunk)
                if size > max_size:
                    # Stop streaming; the partial file is removed below.
                    break
                f.write(chunk)
        if size > max_size:
            os.remove(filepath)  # discard oversized download
            return None
        if size < 500:
            os.remove(filepath)  # too small to be a real image (error page, tracking pixel)
            return None
        # BUG FIX: the original returned a literal placeholder path instead of
        # interpolating the cached filename, so article.image_url never
        # pointed at the file that was just written.
        return f'/static/uploads/zopk/{filename}'
    except Exception as e:
        logger.debug(f" Download error: {e}")
        return None
def fetch_og_image(url, session):
    """Return the og:image (or twitter:image) URL from an article page, or None."""
    from bs4 import BeautifulSoup
    try:
        resp = session.get(url, timeout=10, allow_redirects=True)
        if resp.status_code != 200:
            return None
        # Meta tags live in <head>; parsing the first 50 kB is enough.
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        # Prefer og:image, fall back to twitter:image.
        for selector in ({'property': 'og:image'}, {'name': 'twitter:image'}):
            tag = soup.find('meta', attrs=selector)
            if tag and tag.get('content', '').startswith('http'):
                return tag['content'].strip()
    except Exception:
        # Best-effort helper: any fetch/parse error just means "no image".
        pass
    return None
def _cache_article_image(article, session, dry_run):
    """Try to cache an image for one article; return True on success.

    Resolution order: the article's stored image_url (if it is an absolute
    http(s) URL), otherwise og:image scraped from the article page. If the
    stored URL fails to download, og:image is tried as a fallback. Unless
    dry_run is set, article.image_url is updated to the local path.
    """
    current_url = article.image_url or ''
    if current_url.startswith('http'):
        image_url = current_url
    else:
        # No usable direct URL — scrape og:image from the article page.
        image_url = fetch_og_image(article.url, session)
        time.sleep(1)  # rate-limit article-page fetches
    if not image_url:
        return False

    local_path = download_image(image_url, article.id, session)
    if not local_path and image_url == current_url:
        # Direct download of the stored URL failed: try og:image instead.
        og = fetch_og_image(article.url, session)
        if og and og != image_url:
            local_path = download_image(og, article.id, session)
        time.sleep(1)  # rate-limit the extra page fetch
    if not local_path:
        return False

    if not dry_run:
        article.image_url = local_path
    return True


def main():
    """Backfill local image cache for approved ZOPK news articles."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--dry-run', action='store_true',
                        help='download images but do not update the database')
    parser.add_argument('--limit', type=int, default=250,
                        help='maximum number of articles to process')
    args = parser.parse_args()

    session = requests.Session()
    session.headers.update({
        'User-Agent': USER_AGENT,
        'Accept': 'image/*, text/html',
        'Accept-Language': 'pl-PL,pl;q=0.9',
    })

    db = SessionLocal()
    try:
        # Articles whose image_url is missing or still points at a remote
        # host (anything not already under /static/).
        articles = db.query(ZOPKNews).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            or_(
                ZOPKNews.image_url.is_(None),
                ~ZOPKNews.image_url.like('/static/%')
            )
        ).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all()
        logger.info(f"Found {len(articles)} articles needing local image cache")

        cached = 0
        failed = 0
        for article in articles:
            if _cache_article_image(article, session, args.dry_run):
                logger.info(f" [{article.id}] Cached: {article.title[:50]}")
                cached += 1
            else:
                failed += 1
            time.sleep(0.5)  # rate limiting between articles

        if not args.dry_run:
            db.commit()
        logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped")
        if args.dry_run:
            logger.info("DRY RUN — no changes made")
    finally:
        db.close()


if __name__ == '__main__':
    main()