Some checks are pending
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Fix broken news thumbnails by adding og:image extraction during content scraping (replaces Brave proxy URLs that block hotlinking)
- Add image onerror fallback in templates showing domain favicon when original image fails to load
- Decode Brave proxy image URLs to original source URLs before saving
- Enforce English-only entity types in AI extraction prompt to prevent mixed Polish/English type names
- Add migration 083 to normalize 14 existing Polish entity types and clean up 5 stale fetch jobs stuck in 'running' status
- Add backfill script for existing articles with broken image URLs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
135 lines
4.4 KiB
Python
135 lines
4.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Backfill ZOPK news image URLs.
|
|
|
|
1. Decode Brave proxy URLs to original image URLs
|
|
2. Fetch og:image for scraped articles without images
|
|
|
|
Usage:
|
|
python3 scripts/backfill_zopk_images.py [--dry-run]
|
|
"""
|
|
import sys
|
|
import os
|
|
import re
|
|
import base64
|
|
import logging
|
|
import argparse
|
|
|
|
# Put the directory one level above this script's folder at the front of
# sys.path (presumably so the project-level `database` module resolves when
# the script is launched as scripts/backfill_zopk_images.py -- confirm).
sys.path.insert(
    0,
    os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
)

from dotenv import load_dotenv

# Statement order is kept as-is: the .env file is loaded before the
# `database` import below runs.
load_dotenv()

from database import SessionLocal, ZOPKNews

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def decode_brave_proxy_url(proxy_url):
    """Decode a Brave Search proxy image URL to the original source URL.

    The original URL is expected as URL-safe base64 text following the
    ``/g:ce/`` marker of an ``imgs.search.brave.com`` URL.

    Args:
        proxy_url: Candidate proxy URL; may be ``None`` or empty.

    Returns:
        The decoded ``http://`` or ``https://`` URL, or ``None`` when the
        input is not a Brave proxy URL, has no ``/g:ce/`` payload, cannot be
        base64-decoded, or does not decode to an http(s) URL.
    """
    if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
        return None

    match = re.search(r'/g:ce/(.+)$', proxy_url)
    if not match:
        return None

    # The captured tail may contain '/' separators; drop them to recover the
    # contiguous base64 text.
    encoded = match.group(1).replace('/', '')
    # Restore the '=' padding needed to reach a multiple of four characters.
    encoded += '=' * (-len(encoded) % 4)

    try:
        raw = base64.urlsafe_b64decode(encoded)
    except ValueError as exc:
        # binascii.Error (bad padding/length) is a ValueError subclass, as is
        # the error raised for non-ASCII input.
        logger.debug("Decode failed: %s", exc)
        return None

    decoded = raw.decode('utf-8', errors='ignore')
    # Require a real scheme separator so payloads that merely begin with the
    # letters "http" are not mistaken for URLs.
    if decoded.startswith(('http://', 'https://')):
        return decoded
    return None
|
|
|
|
|
|
def fetch_og_image(url, timeout=10):
    """Fetch the preview image URL advertised by a web page.

    Requests ``url`` and inspects the first 50,000 characters of the response
    for an ``og:image`` meta tag, falling back to ``twitter:image``. The tag's
    ``content`` value is stripped of surrounding whitespace and resolved
    against the final response URL, so relative and protocol-relative values
    are returned as absolute URLs.

    Args:
        url: Page URL to fetch; ``None`` or empty yields ``None``.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        An absolute ``http://`` or ``https://`` image URL, or ``None`` when
        the request fails, the status is not 200, or no usable tag is found.
        Errors are logged at debug level and never raised to the caller.
    """
    # Third-party imports are kept local to this function.
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    if not url:
        return None

    # Tried in order; the first tag with a usable value wins.
    lookups = (
        {'property': 'og:image'},
        {'attrs': {'name': 'twitter:image'}},
    )

    try:
        resp = requests.get(
            url,
            timeout=timeout,
            headers={'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)'},
            allow_redirects=True,
        )
        if resp.status_code != 200:
            return None

        soup = BeautifulSoup(resp.text[:50000], 'html.parser')
        for lookup in lookups:
            tag = soup.find('meta', **lookup)
            if tag is None:
                continue
            # Strip before validating so padded values are not rejected.
            content = (tag.get('content') or '').strip()
            if not content:
                continue
            # resp.url is the URL after redirects, i.e. the page the tag
            # actually came from.
            absolute = urljoin(resp.url, content)
            if absolute.startswith(('http://', 'https://')):
                return absolute
    except Exception as exc:
        # Best-effort lookup: any network or parsing failure means "no image".
        logger.debug("og:image fetch failed for %s: %s", str(url)[:60], exc)
    return None
|
|
|
|
|
|
def main():
    """Backfill image URLs on approved ZOPK news articles.

    Two passes run against one database session:

    1. Articles whose ``image_url`` contains ``imgs.search.brave.com`` get
       that URL replaced by the result of ``decode_brave_proxy_url``.
    2. Scraped articles whose ``image_url`` is a Google favicon URL get it
       replaced by the result of ``fetch_og_image``. At most 50 articles are
       processed per run, with a one-second pause between requests.

    Both passes only consider articles with status ``approved`` or
    ``auto_approved``. With ``--dry-run`` the same work is logged (pass 2
    still performs the HTTP requests) but nothing is assigned or committed.
    """
    # Imported once here rather than inside the fetch loop.
    import time

    max_og_fetches = 50        # cap on outbound page requests per run
    fetch_delay_seconds = 1    # pause between consecutive page requests

    parser = argparse.ArgumentParser(description='Backfill ZOPK news image URLs.')
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='log what would change without writing to the database',
    )
    args = parser.parse_args()

    db = SessionLocal()
    try:
        # Step 1: Decode Brave proxy URLs
        brave_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%imgs.search.brave.com%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
        ).all()

        logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs")
        decoded_count = 0

        for article in brave_articles:
            original = decode_brave_proxy_url(article.image_url)
            if not original or original == article.image_url:
                continue
            # Guard against a missing title so one bad row cannot abort the
            # run before the commit.
            title = article.title or ''
            logger.info(f" [{article.id}] {title[:50]}")
            logger.info(f" Brave: {article.image_url[:80]}...")
            logger.info(f" Original: {original[:80]}")
            if not args.dry_run:
                article.image_url = original
            # Counted in dry-run mode too, as "would be decoded".
            decoded_count += 1

        if not args.dry_run:
            db.commit()
        logger.info(f"Decoded {decoded_count} Brave proxy URLs")

        # Step 2: For articles with favicon-only images, try fetching og:image
        favicon_articles = db.query(ZOPKNews).filter(
            ZOPKNews.image_url.like('%google.com/s2/favicons%'),
            ZOPKNews.status.in_(['approved', 'auto_approved']),
            ZOPKNews.scrape_status == 'scraped',
        ).all()

        logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images")
        if len(favicon_articles) > max_og_fetches:
            logger.info(f"Processing only the first {max_og_fetches} in this run")
        og_count = 0

        # Limit to avoid too many requests
        for index, article in enumerate(favicon_articles[:max_og_fetches]):
            if index:
                # Rate limiting: pause between requests, not after the last.
                time.sleep(fetch_delay_seconds)
            og_image = fetch_og_image(article.url)
            if not og_image:
                continue
            logger.info(f" [{article.id}] og:image found: {og_image[:80]}")
            if not args.dry_run:
                article.image_url = og_image
            og_count += 1

        if not args.dry_run:
            db.commit()
        logger.info(f"Updated {og_count} articles with og:image")

        logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched")
        if args.dry_run:
            logger.info("DRY RUN - no changes made")
    finally:
        # Always release the session, including when a step raises.
        db.close()
|
|
|
|
|
|
# Run the backfill only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
|