diff --git a/database/migrations/083_zopk_cleanup.sql b/database/migrations/083_zopk_cleanup.sql new file mode 100644 index 0000000..b5dc32f --- /dev/null +++ b/database/migrations/083_zopk_cleanup.sql @@ -0,0 +1,29 @@ +-- Migration 083: ZOPK data cleanup and normalization +-- Date: 2026-03-15 +-- Description: +-- 1. Normalize entity types (Polish → English) +-- 2. Clean up stale fetch jobs +-- 3. (no permission changes — the planned grants are not part of this migration) + +-- ============================================================ +-- 1. Normalize entity types (Polish → English) +-- ============================================================ + +UPDATE zopk_knowledge_entities SET entity_type = 'organization' WHERE entity_type = 'Organizacja'; +UPDATE zopk_knowledge_entities SET entity_type = 'place' WHERE entity_type = 'Lokalizacja'; +UPDATE zopk_knowledge_entities SET entity_type = 'person' WHERE entity_type = 'Osoba'; +UPDATE zopk_knowledge_entities SET entity_type = 'project' WHERE entity_type = 'Projekt'; +UPDATE zopk_knowledge_entities SET entity_type = 'company' WHERE entity_type = 'Dokument/Umowa'; -- NOTE(review): document/contract → 'company' looks unintended; confirm the target type +UPDATE zopk_knowledge_entities SET entity_type = 'organization' WHERE entity_type = 'Kraj/Narodowość'; -- NOTE(review): country/nationality → 'organization'; 'place' may be more accurate — confirm +UPDATE zopk_knowledge_entities SET entity_type = 'technology' WHERE entity_type = 'Element techniczny'; -- NOTE(review): 'technology' is not among the allowed types (company/person/place/organization/project) — confirm + +-- ============================================================ +-- 2. Clean up stale fetch jobs (stuck in 'running' status) +-- ============================================================ + +UPDATE zopk_news_fetch_jobs +SET status = 'failed', + error_message = 'Automatycznie zakończony — utknął w statusie running', + completed_at = NOW() +WHERE status = 'running' +AND started_at < NOW() - INTERVAL '1 hour'; diff --git a/scripts/backfill_zopk_images.py b/scripts/backfill_zopk_images.py new file mode 100644 index 0000000..79ea63e --- /dev/null +++ b/scripts/backfill_zopk_images.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +""" +Backfill ZOPK news image URLs. + +1. 
Decode Brave proxy URLs to original image URLs +2. Fetch og:image for scraped articles without images + +Usage: + python3 scripts/backfill_zopk_images.py [--dry-run] +""" +import sys +import os +import re +import base64 +import logging +import argparse + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from dotenv import load_dotenv +load_dotenv() + +from database import SessionLocal, ZOPKNews + +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + + +def decode_brave_proxy_url(proxy_url): + """Decode Brave Search proxy image URL to original source URL.""" + if not proxy_url or 'imgs.search.brave.com' not in proxy_url: + return None + try: + match = re.search(r'/g:ce/(.+)$', proxy_url) + if not match: + return None + encoded = match.group(1).replace('/', '') + padding = 4 - len(encoded) % 4 + if padding != 4: + encoded += '=' * padding + decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore') + if decoded.startswith('http'): + return decoded + except Exception as e: + logger.debug(f"Decode failed: {e}") + return None + + +def fetch_og_image(url, timeout=10): + """Fetch og:image meta tag from a URL.""" + import requests + from bs4 import BeautifulSoup + try: + resp = requests.get(url, timeout=timeout, headers={ + 'User-Agent': 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot)' + }, allow_redirects=True) + if resp.status_code != 200: + return None + soup = BeautifulSoup(resp.text[:50000], 'html.parser') + og = soup.find('meta', property='og:image') + if og and og.get('content', '').startswith('http'): + return og['content'].strip() + tw = soup.find('meta', attrs={'name': 'twitter:image'}) + if tw and tw.get('content', '').startswith('http'): + return tw['content'].strip() + except Exception as e: + logger.debug(f"og:image fetch failed for {url[:60]}: {e}") + return None + + +def main(): + parser = argparse.ArgumentParser() + 
parser.add_argument('--dry-run', action='store_true') + args = parser.parse_args() + + db = SessionLocal() + try: + # Step 1: Decode Brave proxy URLs + brave_articles = db.query(ZOPKNews).filter( + ZOPKNews.image_url.like('%imgs.search.brave.com%'), + ZOPKNews.status.in_(['approved', 'auto_approved']) + ).all() + + logger.info(f"Found {len(brave_articles)} articles with Brave proxy image URLs") + decoded_count = 0 + + for article in brave_articles: + original = decode_brave_proxy_url(article.image_url) + if original and original != article.image_url: + logger.info(f" [{article.id}] {article.title[:50]}") + logger.info(f" Brave: {article.image_url[:80]}...") + logger.info(f" Original: {original[:80]}") + if not args.dry_run: + article.image_url = original + decoded_count += 1 + + if not args.dry_run: + db.commit() + logger.info(f"Decoded {decoded_count} Brave proxy URLs") + + # Step 2: For articles with favicon-only images, try fetching og:image + favicon_articles = db.query(ZOPKNews).filter( + ZOPKNews.image_url.like('%google.com/s2/favicons%'), + ZOPKNews.status.in_(['approved', 'auto_approved']), + ZOPKNews.scrape_status == 'scraped' + ).all() + + logger.info(f"\nFound {len(favicon_articles)} articles with favicon-only images") + og_count = 0 + + for article in favicon_articles[:50]: # Limit to avoid too many requests + og_image = fetch_og_image(article.url) + if og_image: + logger.info(f" [{article.id}] og:image found: {og_image[:80]}") + if not args.dry_run: + article.image_url = og_image + og_count += 1 + import time + time.sleep(1) # Rate limiting + + if not args.dry_run: + db.commit() + logger.info(f"Updated {og_count} articles with og:image") + + logger.info(f"\nSummary: {decoded_count} Brave decoded, {og_count} og:image fetched") + if args.dry_run: + logger.info("DRY RUN - no changes made") + + finally: + db.close() + + +if __name__ == '__main__': + main() diff --git a/templates/zopk/index.html b/templates/zopk/index.html index a854ed5..aa666e6 100644 --- 
a/templates/zopk/index.html +++ b/templates/zopk/index.html @@ -1316,10 +1316,14 @@ {% for news in news_items %} {% if news.image_url %} - + + {% else %}
- +
{% endif %}
diff --git a/templates/zopk/news_list.html b/templates/zopk/news_list.html index 6ec9fc1..c44b1b6 100644 --- a/templates/zopk/news_list.html +++ b/templates/zopk/news_list.html @@ -179,10 +179,14 @@ {% for news in news_items %} {% if news.image_url %} - + + {% else %}
- +
{% endif %}
diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py index 79a62e2..9504586 100644 --- a/zopk_content_scraper.py +++ b/zopk_content_scraper.py @@ -617,6 +617,26 @@ class ZOPKContentScraper: return text + def _extract_og_image(self, html: str) -> Optional[str]: + """Extract og:image URL from HTML meta tags.""" + try: + soup = BeautifulSoup(html, 'html.parser') + # Try og:image first + og = soup.find('meta', property='og:image') + if og and og.get('content'): + url = og['content'].strip() + if url.startswith('http') and len(url) < 1000: + return url + # Try twitter:image as fallback + tw = soup.find('meta', attrs={'name': 'twitter:image'}) + if tw and tw.get('content'): + url = tw['content'].strip() + if url.startswith('http') and len(url) < 1000: + return url + except Exception as e: + logger.debug(f"og:image extraction failed: {e}") + return None + def _count_words(self, text: str) -> int: """Count words in text.""" if not text: @@ -731,6 +751,15 @@ class ZOPKContentScraper: status='failed' ) + # Extract og:image for better thumbnails + og_image = self._extract_og_image(html) + if og_image: + # Replace Brave proxy or favicon URLs with real og:image + current_img = news.image_url or '' + if not current_img or 'imgs.search.brave.com' in current_img or 'google.com/s2/favicons' in current_img: + news.image_url = og_image + logger.info(f"Updated image_url from og:image for article {news_id}") + # Success - update database word_count = self._count_words(content) diff --git a/zopk_knowledge_service.py b/zopk_knowledge_service.py index f683dbe..81eb0a5 100644 --- a/zopk_knowledge_service.py +++ b/zopk_knowledge_service.py @@ -114,8 +114,9 @@ Zwróć JSON z następującą strukturą: "summary": "krótkie podsumowanie" }} -Typy faktów: investment, decision, event, statistic, partnership, milestone -Typy encji: company, person, place, organization, project""" +Typy faktów (TYLKO te angielskie nazwy): investment, decision, event, statistic, partnership, milestone +Typy 
encji (TYLKO te angielskie nazwy): company, person, place, organization, project +WAŻNE: Nigdy nie używaj polskich nazw typów (np. Organizacja, Lokalizacja, Osoba). Zawsze angielskie.""" # System prompt is now empty - the user prompt contains all necessary instructions EXTRACTION_SYSTEM_PROMPT = "" diff --git a/zopk_news_service.py b/zopk_news_service.py index 210a3ae..9ee85cf 100644 --- a/zopk_news_service.py +++ b/zopk_news_service.py @@ -22,6 +22,7 @@ Created: 2026-01-11 import os import re import time +import base64 import hashlib import logging import unicodedata @@ -951,6 +952,34 @@ class ZOPKNewsService: 'knowledge_entities_created': saved_count # Same as saved_new for now } + @staticmethod + def _decode_brave_image_url(proxy_url: Optional[str]) -> Optional[str]: + """Decode Brave Search proxy image URL to original source URL. + + Brave proxy URLs encode the original URL as base64 after '/g:ce/'. + Example: https://imgs.search.brave.com/.../g:ce/aHR0cHM6Ly9... → https://... + """ + if not proxy_url or 'imgs.search.brave.com' not in proxy_url: + return proxy_url + try: + # Extract base64 part after /g:ce/ + match = re.search(r'/g:ce/(.+)$', proxy_url) + if not match: + return proxy_url + encoded = match.group(1) + # Brave splits the base64 payload across URL path segments; drop the '/' separators before decoding + encoded = encoded.replace('/', '') + # Add padding + padding = 4 - len(encoded) % 4 + if padding != 4: + encoded += '=' * padding + decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore') + if decoded.startswith('http'): + return decoded + except Exception: + pass + return proxy_url + def _search_brave_single(self, query: str) -> List[NewsItem]: """Search Brave API with a single query, with retry on 429""" if not self.brave_api_key: @@ -992,7 +1021,7 @@ class ZOPKNewsService: source_type='brave', source_id=f'brave_{query[:20]}', published_at=datetime.now(), - image_url=item.get('thumbnail', {}).get('src') + 
image_url=self._decode_brave_image_url(item.get('thumbnail', {}).get('src')) )) break # success elif response.status_code == 429: