fix: add local image caching for ZOPK news thumbnails
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Source servers return 503 (Cloudflare) for cross-origin image requests from browsers. Solution: download and cache images server-side during scraping, and serve them from /static/uploads/zopk/. - Scraper now downloads og:image and stores it locally during article scraping (max 2MB; supports jpg/png/webp) - Backfill script downloads images for all existing articles server-side - Template fallback shows the domain's initial letter when no image is available Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5ffeb80959
commit
172f2085db
@ -1,17 +1,16 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Backfill ZOPK news image URLs.
|
Backfill ZOPK news images — download and cache locally.
|
||||||
|
|
||||||
1. Decode Brave proxy URLs to original image URLs
|
Downloads images from original source URLs and saves them to
|
||||||
2. Fetch og:image for scraped articles without images
|
static/uploads/zopk/ so they can be served without cross-origin issues.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 scripts/backfill_zopk_images.py [--dry-run]
|
python3 scripts/backfill_zopk_images.py [--dry-run] [--limit N]
|
||||||
"""
|
"""
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import re
|
import time
|
||||||
import base64
|
|
||||||
import logging
|
import logging
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
@ -21,39 +20,72 @@ from dotenv import load_dotenv
|
|||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
from database import SessionLocal, ZOPKNews
|
from database import SessionLocal, ZOPKNews
|
||||||
|
from sqlalchemy import or_
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Reuse scraper's session and image download logic
|
||||||
|
import requests
|
||||||
|
|
||||||
def decode_brave_proxy_url(proxy_url):
|
USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'
|
||||||
"""Decode Brave Search proxy image URL to original source URL."""
|
CACHE_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
|
||||||
if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
|
'static', 'uploads', 'zopk')
|
||||||
return None
|
|
||||||
|
|
||||||
|
def download_image(image_url, news_id, session):
|
||||||
|
"""Download image and save locally. Returns local path or None."""
|
||||||
|
os.makedirs(CACHE_DIR, exist_ok=True)
|
||||||
try:
|
try:
|
||||||
match = re.search(r'/g:ce/(.+)$', proxy_url)
|
resp = session.get(image_url, timeout=10, stream=True)
|
||||||
if not match:
|
if resp.status_code != 200:
|
||||||
|
logger.debug(f" HTTP {resp.status_code}: {image_url[:80]}")
|
||||||
return None
|
return None
|
||||||
encoded = match.group(1).replace('/', '')
|
|
||||||
padding = 4 - len(encoded) % 4
|
content_type = resp.headers.get('Content-Type', '')
|
||||||
if padding != 4:
|
if 'image' not in content_type and not any(
|
||||||
encoded += '=' * padding
|
image_url.lower().endswith(e) for e in ('.jpg', '.jpeg', '.png', '.webp', '.gif')
|
||||||
decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
|
):
|
||||||
if decoded.startswith('http'):
|
logger.debug(f" Not an image ({content_type}): {image_url[:80]}")
|
||||||
return decoded
|
return None
|
||||||
|
|
||||||
|
ext = '.jpg'
|
||||||
|
if '.png' in image_url.lower() or 'png' in content_type:
|
||||||
|
ext = '.png'
|
||||||
|
elif '.webp' in image_url.lower() or 'webp' in content_type:
|
||||||
|
ext = '.webp'
|
||||||
|
|
||||||
|
filename = f'{news_id}{ext}'
|
||||||
|
filepath = os.path.join(CACHE_DIR, filename)
|
||||||
|
|
||||||
|
max_size = 2 * 1024 * 1024
|
||||||
|
size = 0
|
||||||
|
with open(filepath, 'wb') as f:
|
||||||
|
for chunk in resp.iter_content(chunk_size=8192):
|
||||||
|
size += len(chunk)
|
||||||
|
if size > max_size:
|
||||||
|
break
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
if size > max_size:
|
||||||
|
os.remove(filepath)
|
||||||
|
return None
|
||||||
|
if size < 500:
|
||||||
|
os.remove(filepath)
|
||||||
|
return None
|
||||||
|
|
||||||
|
return f'/static/uploads/zopk/{filename}'
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug(f"Decode failed: {e}")
|
logger.debug(f" Download error: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def fetch_og_image(url, timeout=10):
|
def fetch_og_image(url, session):
|
||||||
"""Fetch og:image meta tag from a URL."""
|
"""Fetch og:image URL from article page."""
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
try:
|
try:
|
||||||
resp = requests.get(url, timeout=timeout, headers={
|
resp = session.get(url, timeout=10, allow_redirects=True)
|
||||||
'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'
|
|
||||||
}, allow_redirects=True)
|
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
return None
|
return None
|
||||||
soup = BeautifulSoup(resp.text[:50000], 'html.parser')
|
soup = BeautifulSoup(resp.text[:50000], 'html.parser')
|
||||||
@ -63,68 +95,83 @@ def fetch_og_image(url, timeout=10):
|
|||||||
tw = soup.find('meta', attrs={'name': 'twitter:image'})
|
tw = soup.find('meta', attrs={'name': 'twitter:image'})
|
||||||
if tw and tw.get('content', '').startswith('http'):
|
if tw and tw.get('content', '').startswith('http'):
|
||||||
return tw['content'].strip()
|
return tw['content'].strip()
|
||||||
except Exception as e:
|
except Exception:
|
||||||
logger.debug(f"og:image fetch failed for {url[:60]}: {e}")
|
pass
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument('--dry-run', action='store_true')
|
parser.add_argument('--dry-run', action='store_true')
|
||||||
|
parser.add_argument('--limit', type=int, default=250)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
session = requests.Session()
|
||||||
|
session.headers.update({
|
||||||
|
'User-Agent': USER_AGENT,
|
||||||
|
'Accept': 'image/*, text/html',
|
||||||
|
'Accept-Language': 'pl-PL,pl;q=0.9',
|
||||||
|
})
|
||||||
|
|
||||||
db = SessionLocal()
|
db = SessionLocal()
|
||||||
try:
|
try:
|
||||||
# Step 1: Decode Brave proxy URLs
|
# Find articles that need local image caching
|
||||||
brave_articles = db.query(ZOPKNews).filter(
|
articles = db.query(ZOPKNews).filter(
|
||||||
ZOPKNews.image_url.like('%imgs.search.brave.com%'),
|
|
||||||
ZOPKNews.status.in_(['approved', 'auto_approved'])
|
|
||||||
).all()
|
|
||||||
|
|
||||||
logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
|
|
||||||
decoded_count = 0
|
|
||||||
|
|
||||||
for article in brave_articles:
|
|
||||||
original = decode_brave_proxy_url(article.image_url)
|
|
||||||
if original and original != article.image_url:
|
|
||||||
logger.info(f" [{article.id}] {article.title[:50]}")
|
|
||||||
logger.info(f" Brave: {article.image_url[:80]}...")
|
|
||||||
logger.info(f" Original: {original[:80]}")
|
|
||||||
if not args.dry_run:
|
|
||||||
article.image_url = original
|
|
||||||
decoded_count += 1
|
|
||||||
|
|
||||||
if not args.dry_run:
|
|
||||||
db.commit()
|
|
||||||
logger.info(f"Decoded {decoded_count} Brave proxy URLs")
|
|
||||||
|
|
||||||
# Step 2: For articles with favicon-only images, try fetching og:image
|
|
||||||
favicon_articles = db.query(ZOPKNews).filter(
|
|
||||||
ZOPKNews.image_url.like('%google.com/s2/favicons%'),
|
|
||||||
ZOPKNews.status.in_(['approved', 'auto_approved']),
|
ZOPKNews.status.in_(['approved', 'auto_approved']),
|
||||||
ZOPKNews.scrape_status == 'scraped'
|
or_(
|
||||||
).all()
|
ZOPKNews.image_url.is_(None),
|
||||||
|
~ZOPKNews.image_url.like('/static/%')
|
||||||
|
)
|
||||||
|
).order_by(ZOPKNews.published_at.desc()).limit(args.limit).all()
|
||||||
|
|
||||||
logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")
|
logger.info(f"Found {len(articles)} articles needing local image cache")
|
||||||
og_count = 0
|
cached = 0
|
||||||
|
failed = 0
|
||||||
|
|
||||||
for article in favicon_articles[:50]: # Limit to avoid too many requests
|
for article in articles:
|
||||||
og_image = fetch_og_image(article.url)
|
current_url = article.image_url or ''
|
||||||
if og_image:
|
|
||||||
logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
|
# Try current image_url first, then og:image from article page
|
||||||
if not args.dry_run:
|
image_url = current_url if current_url.startswith('http') else None
|
||||||
article.image_url = og_image
|
|
||||||
og_count += 1
|
if not image_url:
|
||||||
import time
|
# Fetch og:image from article page
|
||||||
time.sleep(1) # Rate limiting
|
image_url = fetch_og_image(article.url, session)
|
||||||
|
time.sleep(1)
|
||||||
|
|
||||||
|
if image_url:
|
||||||
|
local_path = download_image(image_url, article.id, session)
|
||||||
|
if local_path:
|
||||||
|
logger.info(f" [{article.id}] Cached: {article.title[:50]}")
|
||||||
|
if not args.dry_run:
|
||||||
|
article.image_url = local_path
|
||||||
|
cached += 1
|
||||||
|
else:
|
||||||
|
# If direct download failed, try og:image as fallback
|
||||||
|
if image_url == current_url:
|
||||||
|
og = fetch_og_image(article.url, session)
|
||||||
|
if og and og != image_url:
|
||||||
|
local_path = download_image(og, article.id, session)
|
||||||
|
if local_path:
|
||||||
|
logger.info(f" [{article.id}] Cached (og:image fallback): {article.title[:50]}")
|
||||||
|
if not args.dry_run:
|
||||||
|
article.image_url = local_path
|
||||||
|
cached += 1
|
||||||
|
time.sleep(0.5)
|
||||||
|
continue
|
||||||
|
time.sleep(1)
|
||||||
|
failed += 1
|
||||||
|
else:
|
||||||
|
failed += 1
|
||||||
|
|
||||||
|
time.sleep(0.5) # Rate limiting
|
||||||
|
|
||||||
if not args.dry_run:
|
if not args.dry_run:
|
||||||
db.commit()
|
db.commit()
|
||||||
logger.info(f"Updated {og_count} articles with og:image")
|
|
||||||
|
|
||||||
logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
|
logger.info(f"\nSummary: {cached} cached locally, {failed} failed/skipped")
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
logger.info("DRY RUN - no changes made")
|
logger.info("DRY RUN — no changes made")
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|||||||
@ -637,6 +637,56 @@ class ZOPKContentScraper:
|
|||||||
logger.debug(f"og:image extraction failed: {e}")
|
logger.debug(f"og:image extraction failed: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _download_and_cache_image(self, image_url: str, news_id: int) -> Optional[str]:
|
||||||
|
"""Download image and cache locally. Returns local static path or None."""
|
||||||
|
import os
|
||||||
|
cache_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'static', 'uploads', 'zopk')
|
||||||
|
os.makedirs(cache_dir, exist_ok=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
resp = self._session.get(image_url, timeout=10, stream=True)
|
||||||
|
if resp.status_code != 200:
|
||||||
|
logger.debug(f"Image download failed ({resp.status_code}): {image_url[:80]}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
content_type = resp.headers.get('Content-Type', '')
|
||||||
|
if 'image' not in content_type and not image_url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Determine extension
|
||||||
|
ext = '.jpg'
|
||||||
|
if '.png' in image_url.lower() or 'png' in content_type:
|
||||||
|
ext = '.png'
|
||||||
|
elif '.webp' in image_url.lower() or 'webp' in content_type:
|
||||||
|
ext = '.webp'
|
||||||
|
|
||||||
|
filename = f'{news_id}{ext}'
|
||||||
|
filepath = os.path.join(cache_dir, filename)
|
||||||
|
|
||||||
|
# Download (max 2MB)
|
||||||
|
max_size = 2 * 1024 * 1024
|
||||||
|
size = 0
|
||||||
|
with open(filepath, 'wb') as f:
|
||||||
|
for chunk in resp.iter_content(chunk_size=8192):
|
||||||
|
size += len(chunk)
|
||||||
|
if size > max_size:
|
||||||
|
f.close()
|
||||||
|
os.remove(filepath)
|
||||||
|
logger.debug(f"Image too large (>{max_size}B): {image_url[:80]}")
|
||||||
|
return None
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
if size < 500: # Too small, probably an error page
|
||||||
|
os.remove(filepath)
|
||||||
|
return None
|
||||||
|
|
||||||
|
logger.info(f"Cached image for news {news_id}: {filename} ({size} bytes)")
|
||||||
|
return f'/static/uploads/zopk/{filename}'
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.debug(f"Image cache failed for news {news_id}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
def _count_words(self, text: str) -> int:
|
def _count_words(self, text: str) -> int:
|
||||||
"""Count words in text."""
|
"""Count words in text."""
|
||||||
if not text:
|
if not text:
|
||||||
@ -751,14 +801,15 @@ class ZOPKContentScraper:
|
|||||||
status='failed'
|
status='failed'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract og:image for better thumbnails
|
# Extract og:image and cache locally for reliable display
|
||||||
og_image = self._extract_og_image(html)
|
og_image = self._extract_og_image(html)
|
||||||
if og_image:
|
image_to_cache = og_image or news.image_url
|
||||||
# Replace Brave proxy or favicon URLs with real og:image
|
if image_to_cache and not (news.image_url or '').startswith('/static/'):
|
||||||
current_img = news.image_url or ''
|
local_path = self._download_and_cache_image(image_to_cache, news_id)
|
||||||
if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img:
|
if local_path:
|
||||||
|
news.image_url = local_path
|
||||||
|
elif og_image:
|
||||||
news.image_url = og_image
|
news.image_url = og_image
|
||||||
logger.info(f"Updated image_url from og:image for article {news_id}")
|
|
||||||
|
|
||||||
# Success - update database
|
# Success - update database
|
||||||
word_count = self._count_words(content)
|
word_count = self._count_words(content)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user