#!/usr/bin/env python3
"""
Backfill ZOPK news images — download and cache locally.

Downloads images from original source URLs and saves them to
static/uploads/zopk/ so they can be served without cross-origin issues.

Usage:
    python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N]
"""
import sys
import os
import time
import logging
import argparse

# Make the project root importable when run as a script from scripts/.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()

from database import SessionLocal, ZOPKNews
from sqlalchemy import or_

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Reuse scraper's session and image download logic
import requests

USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'
CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
                         'static', 'uploads', 'zopk')


def download_image(image_url, news_id, session):
    """Download an image and save it under CACHE_DIR as '<news_id>.<ext>'.

    Validates the response looks like an image (Content-Type or URL
    extension), caps the download at 2 MB, and rejects tiny (<500 byte)
    files, which are usually tracking pixels or error pages.

    Returns the web path ('/static/uploads/zopk/<filename>') on success,
    or None on any failure.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    try:
        # Stream so we can enforce the size cap without buffering the
        # whole body; 'with' guarantees the connection is released on
        # every return path (the original leaked it).
        with session.get(image_url, timeout=10, stream=True) as resp:
            if resp.status_code != 200:
                logger.debug(f" HTTP {resp.status_code}: {image_url[:80]}")
                return None
            content_type = resp.headers.get('Content-Type', '')
            if 'image' not in content_type and not any(
                image_url.lower().endswith(e)
                for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
            ):
                logger.debug(f" Not an image ({content_type}): {image_url[:80]}")
                return None

            # Pick an extension from the URL or Content-Type; default to .jpg.
            ext = '.jpg'
            if '.png' in image_url.lower() or 'png' in content_type:
                ext = '.png'
            elif '.webp' in image_url.lower() or 'webp' in content_type:
                ext = '.webp'

            filename = f'{news_id}{ext}'
            filepath = os.path.join(CACHE_DIR, filename)

            max_size = 2 * 1024 * 1024  # 2 MB hard cap
            size = 0
            with open(filepath, 'wb') as f:
                for chunk in resp.iter_content(chunk_size=8192):
                    size += len(chunk)
                    if size > max_size:
                        break
                    f.write(chunk)

        if size > max_size:
            os.remove(filepath)
            return None
        if size < 500:
            # Too small to be a real article image (likely a pixel/error page).
            os.remove(filepath)
            return None

        # BUG FIX: original returned the literal '.../(unknown)' instead of
        # the computed filename, so every cached path was broken.
        return f'/static/uploads/zopk/{filename}'
    except Exception as e:
        logger.debug(f" Download error: {e}")
        return None


def fetch_og_image(url, session):
    """Fetch the og:image (or twitter:image fallback) URL from an article page.

    Only the first 50 KB of HTML is parsed — meta tags live in <head>.
    Returns an absolute http(s) URL or None.
    """
    from bs4 import BeautifulSoup
    try:
        resp = session.get(url, timeout=10, allow_redirects=True)
        if resp.status_code != 200:
            return None
        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        og = soup.find('meta', property='og:image')
        if og and og.get('content', '').startswith('http'):
            return og['content'].strip()
        tw = soup.find('meta', attrs={'name': 'twitter:image'})
        if tw and tw.get('content', '').startswith('http'):
            return tw['content'].strip()
    except Exception:
        # Best-effort: any network/parse failure just means "no image found".
        pass
    return None


def main():
    """Find approved ZOPK articles without a locally cached image and cache one.

    For each article: try the stored image_url first; if that is missing
    or fails, fall back to the page's og:image. Commits DB updates unless
    --dry-run is given. Sleeps between requests for rate limiting.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry-run', action='store_true')
    parser.add_argument('--limit', type=int, default=250)
    args = parser.parse_args()

    session = requests.Session()
    session.headers.update({
        'User-Agent': USER_AGENT,
        'Accept': 'image/*, text/html',
        'Accept-Language': 'pl-PL,pl;q=0.9',
    })

    db = SessionLocal()
    try:
        # Find articles that need local image caching: approved status and
        # either no image at all or an image still pointing off-site.
        articles = db.query(ZOPKNews).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            or_(
                ZOPKNews.image_url.is_(None),
                ~ZOPKNews.image_url.like('/static/%')
            )
        ).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all()

        logger.info(f"Found {len(articles)} articles needing local image cache")

        cached = 0
        failed = 0
        for article in articles:
            current_url = article.image_url or ''
            # Try current image_url first, then og:image from article page
            image_url = current_url if current_url.startswith('http') else None
            if not image_url:
                # Fetch og:image from article page
                image_url = fetch_og_image(article.url, session)
                time.sleep(1)

            if image_url:
                local_path = download_image(image_url, article.id, session)
                if local_path:
                    logger.info(f" [{article.id}] Cached: {article.title[:50]}")
                    if not args.dry_run:
                        article.image_url = local_path
                    cached += 1
                else:
                    # If direct download failed, try og:image as fallback
                    if image_url == current_url:
                        og = fetch_og_image(article.url, session)
                        if og and og != image_url:
                            local_path = download_image(og, article.id, session)
                            if local_path:
                                logger.info(f" [{article.id}] Cached (og:image fallback): {article.title[:50]}")
                                if not args.dry_run:
                                    article.image_url = local_path
                                cached += 1
                                time.sleep(0.5)
                                continue
                        time.sleep(1)
                    failed += 1
            else:
                failed += 1

            time.sleep(0.5)  # Rate limiting

        if not args.dry_run:
            db.commit()

        logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped")
        if args.dry_run:
            logger.info("DRY RUN — no changes made")
    finally:
        db.close()


if __name__ == '__main__':
    main()