nordabiz/scripts/backfill_zopk_images.py
Maciej Pienczyn 172f2085db
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
fix: add local image caching for ZOPK news thumbnails
Source servers return 503 (Cloudflare) for cross-origin image requests
from browsers. Solution: download and cache images server-side during
scraping, serve from /static/uploads/zopk/.

- Scraper now downloads og:image and stores locally during article
  scraping (max 2MB, supports jpg/png/webp)
- Backfill script downloads images for all existing articles server-side
- Template fallback shows domain initial letter when image unavailable

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-15 09:08:03 +01:00

182 lines
6.0 KiB
Python

#!/usr/bin/env python3
"""
Backfill ZOPK news images — download and cache locally.
Downloads images from original source URLs and saves them to
static/uploads/zopk/ so they can be served without cross-origin issues.
Usage:
python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N]
"""
import sys
import os
import time
import logging
import argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from dotenv import load_dotenv
load_dotenv()
from database import SessionLocal, ZOPKNews
from sqlalchemy import or_
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Reuse scraper's session and image download logic
import requests
USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'
CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
'static', 'uploads', 'zopk')
def download_image(image_url, news_id, session):
    """Download an image and cache it locally under CACHE_DIR.

    Args:
        image_url: Absolute URL of the image to fetch.
        news_id: Primary key of the ZOPKNews row; used as the filename stem
            so re-running the backfill overwrites rather than duplicates.
        session: requests.Session preconfigured with the bot User-Agent.

    Returns:
        Web path ('/static/uploads/zopk/<news_id>.<ext>') on success, or
        None on any failure (HTTP error, non-image content, oversized or
        suspiciously tiny payload, network exception).
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        resp = session.get(image_url, timeout=10, stream=True)
        if resp.status_code != 200:
            logger.debug(f" HTTP {resp.status_code}: {image_url[:80]}")
            return None
        content_type = resp.headers.get('Content-Type', '')
        # Accept when either the Content-Type claims an image or the URL has
        # a known image extension (some servers send wrong/missing types).
        if 'image' not in content_type and not any(
            image_url.lower().endswith(e) for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
        ):
            logger.debug(f" Not an image ({content_type}): {image_url[:80]}")
            return None
        # Choose a file extension from the URL or Content-Type; default .jpg.
        ext = '.jpg'
        if '.png' in image_url.lower() or 'png' in content_type:
            ext = '.png'
        elif '.webp' in image_url.lower() or 'webp' in content_type:
            ext = '.webp'
        filename = f'{news_id}{ext}'
        filepath = os.path.join(CACHE_DIR, filename)
        max_size = 2 * 1024 * 1024  # 2 MB hard cap per image
        size = 0
        with open(filepath, 'wb') as f:
            for chunk in resp.iter_content(chunk_size=8192):
                size += len(chunk)
                if size > max_size:
                    # Stop streaming; the partial file is removed below.
                    break
                f.write(chunk)
        if size > max_size:
            os.remove(filepath)  # discard oversized download
            return None
        if size < 500:
            os.remove(filepath)  # too small to be a real image (error page, tracking pixel)
            return None
        # BUG FIX: the original returned a literal placeholder path instead of
        # interpolating the cached filename, so article.image_url never
        # pointed at the file that was just written.
        return f'/static/uploads/zopk/{filename}'
    except Exception as e:
        logger.debug(f" Download error: {e}")
        return None
def fetch_og_image(url, session):
    """Return the og:image (or twitter:image) URL from an article page, or None."""
    from bs4 import BeautifulSoup
    try:
        resp = session.get(url, timeout=10, allow_redirects=True)
        if resp.status_code != 200:
            return None
        # Meta tags live in <head>; parsing the first 50 kB is enough.
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        # Prefer og:image, fall back to twitter:image.
        for selector in ({'property': 'og:image'}, {'name': 'twitter:image'}):
            tag = soup.find('meta', attrs=selector)
            if tag and tag.get('content', '').startswith('http'):
                return tag['content'].strip()
    except Exception:
        # Best-effort helper: any fetch/parse error just means "no image".
        pass
    return None
def _cache_article_image(article, session, dry_run):
    """Try to cache an image for one article; return True on success.

    Resolution order: the article's stored image_url (if it is an absolute
    http(s) URL), otherwise og:image scraped from the article page. If the
    stored URL fails to download, og:image is tried as a fallback. Unless
    dry_run is set, article.image_url is updated to the local path.
    """
    current_url = article.image_url or ''
    if current_url.startswith('http'):
        image_url = current_url
    else:
        # No usable direct URL — scrape og:image from the article page.
        image_url = fetch_og_image(article.url, session)
        time.sleep(1)  # rate-limit article-page fetches
    if not image_url:
        return False

    local_path = download_image(image_url, article.id, session)
    if not local_path and image_url == current_url:
        # Direct download of the stored URL failed: try og:image instead.
        og = fetch_og_image(article.url, session)
        if og and og != image_url:
            local_path = download_image(og, article.id, session)
        time.sleep(1)  # rate-limit the extra page fetch
    if not local_path:
        return False

    if not dry_run:
        article.image_url = local_path
    return True


def main():
    """Backfill local image cache for approved ZOPK news articles."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--dry-run', action='store_true',
                        help='download images but do not update the database')
    parser.add_argument('--limit', type=int, default=250,
                        help='maximum number of articles to process')
    args = parser.parse_args()

    session = requests.Session()
    session.headers.update({
        'User-Agent': USER_AGENT,
        'Accept': 'image/*, text/html',
        'Accept-Language': 'pl-PL,pl;q=0.9',
    })

    db = SessionLocal()
    try:
        # Articles whose image_url is missing or still points at a remote
        # host (anything not already under /static/).
        articles = db.query(ZOPKNews).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            or_(
                ZOPKNews.image_url.is_(None),
                ~ZOPKNews.image_url.like('/static/%')
            )
        ).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all()
        logger.info(f"Found {len(articles)} articles needing local image cache")

        cached = 0
        failed = 0
        for article in articles:
            if _cache_article_image(article, session, args.dry_run):
                logger.info(f" [{article.id}] Cached: {article.title[:50]}")
                cached += 1
            else:
                failed += 1
            time.sleep(0.5)  # rate limiting between articles

        if not args.dry_run:
            db.commit()
        logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped")
        if args.dry_run:
            logger.info("DRY RUN — no changes made")
    finally:
        db.close()


if __name__ == '__main__':
    main()