feat(zopk): Skrypt do pobierania obrazków dla newsów
Strategia pobierania obrazków: 1. Rozwiń URL Google News do oryginalnego źródła 2. Pobierz og:image z meta tagów strony 3. Fallback: logo domeny (Clearbit API) 4. Fallback: favicon (Google Favicon API) Użycie: python scripts/fetch_news_images.py [--dry-run] [--limit N] Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
82d4c870a0
commit
cf56fe7d8a
279
scripts/fetch_news_images.py
Normal file
279
scripts/fetch_news_images.py
Normal file
@ -0,0 +1,279 @@
|
||||
#!/usr/bin/env python3
"""
Fetch images for ZOPK news items.

Strategy:
1. Resolve the Google News URL to the original source
2. Fetch og:image from the original article
3. If no og:image is present, fall back to the domain's favicon

Usage:
    python scripts/fetch_news_images.py --dry-run  # Test without writing
    python scripts/fetch_news_images.py            # Production
    python scripts/fetch_news_images.py --limit 10 # Limit to 10 news items
"""

import os
import sys
import re  # NOTE(review): appears unused in this file — confirm before removing
import argparse
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import time

# Add the project root to sys.path so `from database import ...` resolves
# when this script is run from the scripts/ directory.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Database configuration — the CHANGE_ME placeholder is expected to be
# overridden via the DATABASE_URL environment variable in production.
DATABASE_URL = os.getenv('DATABASE_URL', 'postgresql://nordabiz_app:CHANGE_ME@127.0.0.1:5432/nordabiz')

# Browser-like request headers so news sites serve the full HTML page
# (some sites block default library User-Agents).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'pl,en;q=0.5',
}

# Timeout (seconds) applied to all HTTP requests below.
REQUEST_TIMEOUT = 10
|
||||
|
||||
|
||||
def resolve_google_news_url(google_url: str) -> str:
    """Resolve a Google News URL to the original source article URL.

    Google News wraps article links in redirects, so we follow them:
    first with a cheap HEAD request, then with a full GET if the HEAD
    redirect chain still ends on a google.com host.

    Returns the final URL, or the input URL unchanged if any request
    error occurs (best-effort — processing should continue).
    """
    try:
        # Follow redirects with a HEAD request first (no body download).
        response = requests.head(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        final_url = response.url

        # Sometimes Google News has one more redirect level that only a
        # full GET traverses (e.g. meta-refresh / JS-assisted redirects).
        if 'google.com' in final_url:
            response = requests.get(google_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
            final_url = response.url

        return final_url
    except requests.RequestException as e:
        # Network/HTTP failure: report and fall back to the unresolved URL.
        print(f"  ⚠ Nie można rozwinąć URL: {e}")
        return google_url
|
||||
|
||||
|
||||
def extract_og_image(url: str) -> str | None:
    """Fetch the page at *url* and return its social-preview image URL.

    Looks for the `og:image` meta tag first, then `twitter:image` as a
    fallback. Relative image URLs are made absolute against the page URL.

    Returns the absolute image URL, or None when neither tag is present
    or the request fails.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Check og:image first, then twitter:image — same extraction logic.
        candidates = (
            soup.find('meta', property='og:image'),
            soup.find('meta', attrs={'name': 'twitter:image'}),
        )
        for tag in candidates:
            if tag and tag.get('content'):
                image_url = tag['content']
                # Make sure the URL is absolute.
                if not image_url.startswith('http'):
                    image_url = urljoin(url, image_url)
                return image_url

        return None
    except Exception as e:
        # Broad by design: this is a best-effort scraper boundary — a bad
        # page (network error, malformed HTML) must not abort the batch.
        print(f"  ⚠ Nie można pobrać og:image: {e}")
        return None
|
||||
|
||||
|
||||
def get_favicon_url(url: str) -> str:
|
||||
"""
|
||||
Pobierz URL favicona dla domeny używając Google Favicon API.
|
||||
"""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
domain = parsed.netloc
|
||||
# Google Favicon API - zwraca wysokiej jakości favicon
|
||||
return f"https://www.google.com/s2/favicons?domain={domain}&sz=128"
|
||||
except:
|
||||
return None
|
||||
|
||||
|
||||
def get_domain_logo(url: str) -> str | None:
    """Return a Clearbit logo URL for *url*'s domain, or None.

    Clearbit's logo API answers 200 only when a logo exists for the
    domain; any other status or a request error yields None.
    """
    try:
        # Strip only a leading 'www.' — replace('www.', '') would also
        # mangle hosts that merely contain the substring.
        domain = urlparse(url).netloc.removeprefix('www.')
        # Clearbit Logo API
        logo_url = f"https://logo.clearbit.com/{domain}"

        # Verify the logo actually exists before reporting it.
        response = requests.head(logo_url, timeout=5)
        if response.status_code == 200:
            return logo_url
    except requests.RequestException:
        # Best-effort: no logo is fine, the caller falls back to a favicon.
        pass
    return None
|
||||
|
||||
|
||||
def fetch_image_for_news(news_url: str) -> dict:
    """Find an image for a news item.

    Tries, in order: og:image from the (resolved) article page, the
    domain logo (Clearbit), then the domain favicon.

    Returns a dict with keys 'image_url', 'image_source' and
    'resolved_url'; the first two are None when nothing was found.
    """
    result = {
        'image_url': None,
        'image_source': None,
        'resolved_url': news_url
    }

    # 1. Resolve the URL if it points at Google News.
    if 'news.google.com' in news_url or 'google.com/rss' in news_url:
        print("  → Rozwijanie URL Google News...")
        resolved_url = resolve_google_news_url(news_url)
        result['resolved_url'] = resolved_url
        print(f"  → Rozwinięto do: {resolved_url[:80]}...")
    else:
        resolved_url = news_url

    # 2. Try og:image from the article page.
    print("  → Pobieranie og:image...")
    og_image = extract_og_image(resolved_url)
    if og_image:
        result['image_url'] = og_image
        result['image_source'] = 'og:image'
        print("  ✓ Znaleziono og:image")
        return result

    # 3. Try the domain logo (Clearbit).
    print("  → Szukanie logo domeny...")
    domain_logo = get_domain_logo(resolved_url)
    if domain_logo:
        result['image_url'] = domain_logo
        result['image_source'] = 'domain_logo'
        print("  ✓ Znaleziono logo domeny")
        return result

    # 4. Last resort: the domain favicon.
    print("  → Używanie favicon jako fallback...")
    favicon = get_favicon_url(resolved_url)
    if favicon:
        result['image_url'] = favicon
        result['image_source'] = 'favicon'
        print("  ✓ Użyto favicon")
        return result

    print("  ✗ Nie znaleziono żadnego obrazka")
    return result
|
||||
|
||||
|
||||
def main():
    """Entry point: fetch and store images for approved ZOPK news items.

    Selects approved news (optionally only those missing an image),
    resolves an image for each via fetch_image_for_news(), and writes the
    result back to the database unless --dry-run is given.
    """
    parser = argparse.ArgumentParser(description='Pobierz obrazki dla newsów ZOPK')
    parser.add_argument('--dry-run', action='store_true', help='Tryb testowy - nie zapisuj do bazy')
    parser.add_argument('--limit', type=int, default=None, help='Ogranicz liczbę newsów do przetworzenia')
    parser.add_argument('--force', action='store_true', help='Nadpisz istniejące obrazki')
    args = parser.parse_args()

    print("=" * 60)
    print("ZOPK News Image Fetcher")
    print("=" * 60)

    if args.dry_run:
        print("🔍 TRYB TESTOWY - zmiany NIE będą zapisane\n")

    # Database connection
    engine = create_engine(DATABASE_URL)
    Session = sessionmaker(bind=engine)
    session = Session()

    try:
        # Imported here so the sys.path manipulation at module top has
        # already taken effect.
        from database import ZOPKNews

        # Select approved news items.
        query = session.query(ZOPKNews).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        )

        # Unless --force, skip items that already have an image.
        if not args.force:
            query = query.filter(
                (ZOPKNews.image_url.is_(None)) | (ZOPKNews.image_url == '')
            )

        query = query.order_by(ZOPKNews.published_at.desc())

        if args.limit:
            query = query.limit(args.limit)

        news_items = query.all()

        print(f"📰 Znaleziono {len(news_items)} newsów do przetworzenia\n")

        stats = {
            'processed': 0,
            'og_image': 0,
            'domain_logo': 0,
            'favicon': 0,
            'failed': 0
        }

        for i, news in enumerate(news_items, 1):
            print(f"[{i}/{len(news_items)}] {news.title[:60]}...")

            result = fetch_image_for_news(news.url)

            if result['image_url']:
                stats['processed'] += 1
                # BUGFIX: image_source is 'og:image' but the stats key is
                # 'og_image' — normalize so the summary below counts it
                # (previously og:image successes always printed as 0).
                stats[result['image_source'].replace(':', '_')] += 1

                if not args.dry_run:
                    news.image_url = result['image_url']
                    # Commit per item so progress survives an interruption.
                    session.commit()
                    print("  💾 Zapisano do bazy\n")
                else:
                    print(f"  [DRY-RUN] Obrazek: {result['image_url'][:60]}...\n")
            else:
                stats['failed'] += 1
                print()

            # Pause between requests to avoid hammering the servers.
            time.sleep(0.5)

        print("=" * 60)
        print("PODSUMOWANIE")
        print("=" * 60)
        print(f"Przetworzono: {stats['processed']}")
        print(f"  - og:image: {stats['og_image']}")
        print(f"  - logo domeny: {stats['domain_logo']}")
        print(f"  - favicon: {stats['favicon']}")
        print(f"Nieudane: {stats['failed']}")

        if args.dry_run:
            print("\n⚠️ To był tryb testowy. Uruchom bez --dry-run aby zapisać zmiany.")

    except Exception as e:
        # Top-level boundary: report, dump the traceback, roll back.
        print(f"❌ Błąd: {e}")
        import traceback
        traceback.print_exc()
        session.rollback()
    finally:
        session.close()


if __name__ == '__main__':
    main()
|
||||
Loading…
Reference in New Issue
Block a user