Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add nuclear keyword detection to news fetcher (_detect_nuclear_project_id) - New news matching nuclear keywords get project_id automatically - One-time script tags existing 24+ untagged nuclear news Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1737 lines
64 KiB
Python
1737 lines
64 KiB
Python
"""
|
|
ZOPK News Service
|
|
================
|
|
|
|
Multi-source news search and cross-verification for
|
|
Zielony Okręg Przemysłowy Kaszubia (ZOPK) knowledge base.
|
|
|
|
Sources:
|
|
- Brave Search API (web news)
|
|
- Google News RSS (aggregated news)
|
|
- Local media RSS feeds (trojmiasto.pl, dziennikbaltycki.pl)
|
|
|
|
Cross-verification:
|
|
- 1 source → pending (manual moderation required)
|
|
- 2 sources → pending with higher confidence
|
|
- 3+ sources → auto_approved (verified automatically)
|
|
|
|
Author: NordaBiz Development Team
|
|
Created: 2026-01-11
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import time
|
|
import base64
|
|
import hashlib
|
|
import logging
|
|
import unicodedata
|
|
from datetime import datetime, timedelta
|
|
from typing import List, Dict, Optional, Tuple
|
|
from dataclasses import dataclass
|
|
from urllib.parse import urlparse
|
|
|
|
import requests
|
|
import feedparser
|
|
|
|
# Module-level logger named after this module (standard logging convention).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ============================================================
|
|
# RSS FEED SOURCES
|
|
# ============================================================
|
|
|
|
# RSS feed definitions keyed by a stable source id.
# 'keywords' pre-filters entries of broad feeds; an empty list means no
# filtering (the feed is already scoped by a Google News search query).
RSS_SOURCES = {
    # Local media
    'trojmiasto': {
        'url': 'https://www.trojmiasto.pl/rss/wiadomosci.xml',
        'name': 'trojmiasto.pl',
        'type': 'local_media',
        'keywords': ['kaszubia', 'wejherowo', 'rumia', 'gdynia', 'pomorze', 'offshore', 'energia', 'przemysł', 'samsonowicz', 'kongsberg']
    },
    'dziennik_baltycki': {
        'url': 'https://dziennikbaltycki.pl/rss/najnowsze.xml',
        'name': 'Dziennik Bałtycki',
        'type': 'local_media',
        'keywords': ['kaszubia', 'wejherowo', 'rumia', 'gdynia', 'elektrownia', 'offshore', 'samsonowicz', 'kongsberg', 'lubiatowo']
    },
    # Government sources
    'gov_mon': {
        'url': 'https://www.gov.pl/web/obrona-narodowa/rss',
        'name': 'Ministerstwo Obrony Narodowej',
        'type': 'government',
        'keywords': ['kongsberg', 'przemysł obronny', 'kaszubia', 'rumia', 'samsonowicz', 'inwestycje']
    },
    'gov_przemysl': {
        'url': 'https://www.gov.pl/web/rozwoj-technologia/rss',
        'name': 'Ministerstwo Rozwoju i Technologii',
        'type': 'government',
        'keywords': ['offshore', 'elektrownia jądrowa', 'centrum danych', 'wodór', 'transformacja']
    },
    # Google News aggregated searches
    'google_news_zopk': {
        'url': 'https://news.google.com/rss/search?q=Zielony+Okr%C4%99g+Przemys%C5%82owy+Kaszubia&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []  # No filtering, query-based
    },
    'google_news_offshore': {
        'url': 'https://news.google.com/rss/search?q=offshore+Polska+Baltyk&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    'google_news_nuclear': {
        'url': 'https://news.google.com/rss/search?q=elektrownia+jadrowa+Polska+Lubiatowo&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    'google_news_samsonowicz': {
        'url': 'https://news.google.com/rss/search?q=Maciej+Samsonowicz+MON&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    'google_news_kongsberg': {
        'url': 'https://news.google.com/rss/search?q=Kongsberg+Polska+Rumia&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    # Business/local organizations (via Google News)
    'google_news_norda': {
        'url': 'https://news.google.com/rss/search?q=Norda+Biznes+Wejherowo&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    'google_news_spoko': {
        'url': 'https://news.google.com/rss/search?q=Spoko+Gospodarcze+Pomorze&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News',
        'type': 'aggregator',
        'keywords': []
    },
    # Regional media (via Google News - site-specific searches)
    'google_news_norda_fm': {
        'url': 'https://news.google.com/rss/search?q=site:nordafm.pl+OR+%22Norda+FM%22&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Norda FM',
        'type': 'local_media',
        'keywords': []
    },
    'google_news_ttm': {
        'url': 'https://news.google.com/rss/search?q=site:ttm24.pl+OR+%22Twoja+Telewizja+Morska%22&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Twoja Telewizja Morska',
        'type': 'local_media',
        'keywords': []
    },
    'google_news_nadmorski24': {
        'url': 'https://news.google.com/rss/search?q=site:nadmorski24.pl&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Nadmorski24.pl',
        'type': 'local_media',
        'keywords': []
    },
    # Facebook - Maciej Samsonowicz (via Google search - FB doesn't have RSS)
    'google_news_samsonowicz_fb': {
        'url': 'https://news.google.com/rss/search?q=%22Maciej+Samsonowicz%22+facebook&hl=pl&gl=PL&ceid=PL:pl',
        'name': 'Google News (Facebook Samsonowicz)',
        'type': 'aggregator',
        'keywords': []
    }
}
|
|
|
|
# ============================================================
|
|
# BRAVE SEARCH - PRECYZYJNE ZAPYTANIA (zamiast jednego ogólnego)
|
|
# ============================================================
|
|
|
|
# Precise Brave Search queries (instead of one generic query).
# 'weight' is a relevance prior (5 = highest); 'description' is a
# user-facing label shown in the process log, hence kept in Polish.
BRAVE_QUERIES = [
    # ============================================================
    # GROUP 1: ZOPK DIRECT MENTIONS (highest priority)
    # ============================================================
    {
        'query': '"Zielony Okręg Przemysłowy" OR "ZOPK Kaszubia"',
        'weight': 5,
        'description': 'ZOPK - bezpośrednie wzmianki'
    },
    {
        'query': '"Maciej Samsonowicz" MON OR przemysł obronny',
        'weight': 5,
        'description': 'Samsonowicz - koordynator ZOPK'
    },

    # ============================================================
    # GROUP 2: DEFENSE INDUSTRY
    # ============================================================
    {
        'query': '"Kongsberg" "Rumia" OR "Kongsberg Defence Poland"',
        'weight': 5,
        'description': 'Kongsberg Rumia'
    },

    # ============================================================
    # GROUP 3: OFFSHORE WIND - projects and companies
    # ============================================================
    {
        'query': '"Baltic Power" OR "Orsted Polska" offshore',
        'weight': 5,
        'description': 'Baltic Power / Orsted'
    },
    {
        'query': '"Baltica" Equinor offshore OR "Baltica 2" "Baltica 3"',
        'weight': 4,
        'description': 'Baltica - Equinor/Polenergia'
    },
    {
        'query': '"F.E.W. Baltic" OR "RWE" offshore Bałtyk wiatrowa',
        'weight': 4,
        'description': 'F.E.W. Baltic / RWE'
    },

    # ============================================================
    # GROUP 4: OFFSHORE WIND - infrastructure and supply chain
    # ============================================================
    {
        'query': '"port instalacyjny" offshore OR "hub serwisowy" wiatrowa Gdynia',
        'weight': 5,
        'description': 'Porty offshore'
    },
    {
        'query': '"CRIST" offshore OR "Remontowa Shipbuilding" wiatrowa',
        'weight': 4,
        'description': 'Stocznie dla offshore'
    },
    {
        'query': '"ST3 Offshore" OR "GSG Towers" wieże wiatrowe',
        'weight': 3,
        'description': 'Producenci konstrukcji'
    },

    # ============================================================
    # GROUP 5: NUCLEAR POWER PLANT
    # ============================================================
    {
        'query': '"elektrownia jądrowa" "Lubiatowo" OR "Choczewo" OR "Kopalino"',
        'weight': 5,
        'description': 'EJ Lubiatowo-Kopalino'
    },
    {
        'query': '"Polskie Elektrownie Jądrowe" OR "PEJ" atom',
        'weight': 5,
        'description': 'PEJ - spółka'
    },
    {
        'query': '"Westinghouse" Polska OR "AP1000" elektrownia',
        'weight': 5,
        'description': 'Westinghouse - technologia'
    },
    {
        'query': '"Bechtel" Polska atom OR elektrownia jądrowa',
        'weight': 4,
        'description': 'Bechtel - wykonawca'
    },

    # ============================================================
    # GROUP 6: SMR (small modular reactors)
    # ============================================================
    {
        'query': '"SMR" Polska OR "Orlen Synthos Green Energy" reaktor',
        'weight': 4,
        'description': 'SMR - małe reaktory'
    },
    {
        'query': '"BWRX-300" OR "GE Hitachi" Polska atom',
        'weight': 4,
        'description': 'BWRX-300 / GE Hitachi'
    },

    # ============================================================
    # GROUP 7: HYDROGEN AND NEW TECHNOLOGIES
    # ============================================================
    {
        'query': '"Dolina Wodorowa" Pomorze OR "H2Gdańsk"',
        'weight': 4,
        'description': 'Dolina Wodorowa'
    },
    {
        'query': '"wodór zielony" Gdańsk OR Gdynia OR Pomorze',
        'weight': 3,
        'description': 'Wodór zielony Pomorze'
    },
    {
        'query': '"centrum danych" Gdynia OR "data center" Pomorze',
        'weight': 4,
        'description': 'Centra danych'
    },

    # ============================================================
    # GROUP 8: ECONOMIC ZONES AND LOCAL GOVERNMENTS
    # ============================================================
    {
        'query': '"Rumia Invest Park" OR "strefa ekonomiczna Rumia"',
        'weight': 4,
        'description': 'Rumia Invest Park'
    },
    {
        'query': '"gmina Choczewo" atom OR inwestycje',
        'weight': 4,
        'description': 'Gmina Choczewo'
    },
    {
        'query': '"gmina Krokowa" OR "powiat pucki" offshore energia',
        'weight': 3,
        'description': 'Samorządy lokalne'
    },

    # ============================================================
    # GROUP 9: PORTS AND LOGISTICS
    # ============================================================
    {
        'query': '"Port Gdynia" offshore OR inwestycje terminal',
        'weight': 4,
        'description': 'Port Gdynia'
    },
    {
        'query': '"Port Gdańsk" offshore OR "DCT" inwestycje',
        'weight': 3,
        'description': 'Port Gdańsk / DCT'
    },

    # ============================================================
    # GROUP 10: LOCAL ENERGY SECTOR
    # ============================================================
    {
        'query': '"Energa" offshore OR "Energa" inwestycje Pomorze',
        'weight': 3,
        'description': 'Energa - lokalny operator'
    },

    # ============================================================
    # GROUP 11: INDUSTRY EVENTS
    # ============================================================
    {
        'query': '"Offshore Wind Poland" konferencja OR "PSEW" wiatrowa',
        'weight': 3,
        'description': 'Konferencje offshore'
    },
    {
        'query': '"Forum Energii" Pomorze OR "WindEurope" Polska',
        'weight': 3,
        'description': 'Eventy energetyczne'
    }
]
|
|
|
|
# ============================================================
|
|
# BLACKLISTA DOMEN - automatyczne odrzucanie
|
|
# ============================================================
|
|
|
|
# Domains that are auto-rejected before any scoring: sport, gossip/lifestyle,
# generic national tabloids, classifieds/services, and foreign outlets with
# no local relevance. Matched case-insensitively by is_blacklisted_domain().
BLACKLISTED_DOMAINS = {
    # Sport
    'sport.pl', 'meczyki.pl', 'sportowefakty.wp.pl', 'przegladsportowy.pl',
    'sport.tvp.pl', 'goal.pl', 'sportbuzz.pl', 'pilkanozna.pl',
    # Gossip and lifestyle
    'pudelek.pl', 'plotek.pl', 'pomponik.pl', 'kozaczek.pl',
    # FIX: a raw URL netloc is ASCII — the site actually resolves as
    # 'jastrzabpost.pl', so the diacritic-only entry could never match.
    # The original spelling is kept alongside for safety.
    'jastrząbpost.pl', 'jastrzabpost.pl',
    'plejada.pl', 'party.pl', 'viva.pl', 'gala.pl',
    # General national news without local context
    'se.pl', 'fakt.pl', 'natemat.pl',
    # Other irrelevant services (weather, classifieds, jobs, real estate)
    'pogoda.interia.pl', 'allegro.pl', 'olx.pl', 'pracuj.pl',
    'gratka.pl', 'otodom.pl', 'otomoto.pl',
    # Foreign outlets
    'reuters.com', 'bbc.com', 'cnn.com', 'theguardian.com'
}
|
|
|
|
# Domeny preferowane (bonus do oceny)
|
|
# Preferred domains and their scoring bonus (higher = more trusted/local).
# NOTE: get_domain_bonus() falls back to a first-match suffix scan over this
# dict, so insertion order is significant and must be preserved.
PREFERRED_DOMAINS = {
    # Local / regional media
    'trojmiasto.pl': 2,
    'dziennikbaltycki.pl': 2,
    'nordafm.pl': 3,
    'ttm24.pl': 3,
    'nadmorski24.pl': 2,
    # Municipal sites
    'gdynia.pl': 2,
    'wejherowo.pl': 2,
    'rumia.eu': 2,
    # Government / business portals
    'gov.pl': 1,
    'biznes.gov.pl': 2,
    # Industry press
    'wnp.pl': 1,
    'wysokienapiecie.pl': 2,
    'energetyka24.com': 2,
    'defence24.pl': 2,
    'gospodarkamorska.pl': 2,
}
|
|
|
|
# ============================================================
|
|
# ZOPK KEYWORDS - słowa kluczowe do pre-filtrowania
|
|
# ============================================================
|
|
|
|
# ZOPK-related keywords for filtering (rozszerzone i pogrupowane)
|
|
# NOTE: list order matters — calculate_keyword_score() reports the FIRST
# matched keyword of a tier in its human-readable reason string.
ZOPK_KEYWORDS_CRITICAL = [
    # MUST HAVE — direct hits (a single match is enough) → score 5
    'zielony okręg przemysłowy', 'zopk',
    # Kongsberg
    'kongsberg rumia', 'kongsberg defence', 'kongsberg poland',
    # Key person
    'maciej samsonowicz', 'samsonowicz mon',
    # Nuclear power plant — site names
    'lubiatowo kopalino', 'elektrownia jądrowa lubiatowo', 'elektrownia jądrowa choczewo',
    # Major offshore projects
    'baltic power', 'baltica offshore', 'baltica 2', 'baltica 3',
    # Economic zone
    'rumia invest park',
    # PEJ (state nuclear company)
    'polskie elektrownie jądrowe', 'pej lubiatowo',
    # Westinghouse / Bechtel
    'westinghouse polska', 'ap1000 polska', 'bechtel polska',
    # Installation port
    'port instalacyjny offshore'
]

ZOPK_KEYWORDS_STRONG = [
    # STRONG — strong links (a single match is enough) → score 4
    # Offshore wind
    'offshore bałtyk', 'farma wiatrowa bałtyk', 'morska energetyka wiatrowa',
    'orsted polska', 'equinor polska', 'rwe offshore', 'few baltic', 'ocean winds',
    'hub serwisowy offshore',
    # Shipyards serving offshore
    'crist offshore', 'remontowa shipbuilding', 'st3 offshore', 'gsg towers',
    # Nuclear — contractors and technology
    'kongsberg polska', 'bwrx-300', 'ge hitachi polska',
    # SMR
    'orlen synthos', 'smr polska', 'małe reaktory modularne',
    # Defense industry
    'przemysł obronny pomorze',
    # Hydrogen
    'dolina wodorowa', 'h2gdańsk', 'wodór zielony gdańsk', 'wodór zielony gdynia',
    'laboratoria wodorowe',
    # Data centers
    'centrum danych gdynia', 'data center gdynia',
    # Local governments
    'gmina choczewo', 'gmina krokowa', 'powiat pucki',
    # Ports
    'port gdynia offshore', 'terminal offshore gdynia',
    # People
    'kosiniak-kamysz przemysł',
    # Energy transition
    'transformacja energetyczna pomorze'
]

ZOPK_KEYWORDS_WEAK = [
    # WEAK — needs 2+ matches, or one match combined with a location → score 2-3
    'offshore wind', 'elektrownia jądrowa', 'przemysł obronny', 'przemysł zbrojeniowy',
    'inwestycje przemysłowe', 'strefa ekonomiczna', 'centrum danych', 'data center',
    'farma wiatrowa', 'energia odnawialna', 'atom polska', 'energetyka jądrowa',
    'morskie wiatrowe', 'turbiny wiatrowe', 'fundamenty offshore', 'monopile',
    'wodór zielony', 'hydrogen', 'magazyn energii',
    'port instalacyjny', 'hub logistyczny', 'stocznia',
    'psew', 'offshore wind poland', 'windeurope', 'forum energii',
    'energa inwestycje'
]

ZOPK_LOCATIONS = [
    # Locations that upgrade weak keywords (and score 1 on their own)
    'kaszuby', 'kaszubia', 'pomorze', 'pomorskie',
    'wejherowo', 'rumia', 'gdynia', 'gdańsk', 'reda', 'puck',
    'choczewo', 'lubiatowo', 'kopalino', 'żarnowiec', 'krokowa',
    'bałtyk', 'baltyk', 'morze bałtyckie',
    'trójmiasto', 'trojmiasto'
]

# Full flat list (kept for backward compatibility with older callers)
ZOPK_KEYWORDS = ZOPK_KEYWORDS_CRITICAL + ZOPK_KEYWORDS_STRONG + ZOPK_KEYWORDS_WEAK
|
|
|
|
|
|
@dataclass
class NewsItem:
    """A single news article collected from any source.

    Instances are created by the fetchers and flow through the filtering
    pipeline. The dataclass is intentionally not frozen/slotted: the
    pipeline attaches extra attributes (e.g. keyword_score) at runtime.
    """
    title: str
    url: str
    description: str
    source_name: str                   # human-readable outlet name
    source_type: str                   # brave, google_news, rss_local
    source_id: str                     # specific source identifier (feed key / query tag)
    published_at: Optional[datetime]
    image_url: Optional[str] = None

    @property
    def url_hash(self) -> str:
        """SHA256 hex digest of the raw URL, used for exact deduplication."""
        return hashlib.sha256(self.url.encode()).hexdigest()

    @property
    def title_hash(self) -> str:
        """Normalized title hash for fuzzy (near-duplicate) matching."""
        return normalize_title_hash(self.title)

    @property
    def domain(self) -> str:
        """Hostname of the article URL without a leading 'www.' prefix.

        FIX: the previous implementation used str.replace('www.', ''),
        which also corrupted hostnames merely containing 'www.' in the
        middle; only the leading prefix is stripped now.
        """
        host = urlparse(self.url).netloc
        return host[4:] if host.startswith('www.') else host
|
|
|
|
|
|
def normalize_title_hash(title: str) -> str:
    """Return a 32-character fingerprint of *title* for fuzzy duplicate detection.

    The title is lowercased, stripped of combining diacritics and
    punctuation, common Polish function words and tokens of two characters
    or fewer are dropped, the surviving words are sorted alphabetically,
    and the canonical string is SHA256-hashed (truncated to 32 hex chars).
    Titles differing only in casing, word order, punctuation or diacritics
    therefore collide on purpose. An empty/None title yields ''.
    """
    if not title:
        return ''

    # Fold towards ASCII: NFKD-decompose, then drop the combining marks.
    decomposed = unicodedata.normalize('NFKD', title.lower())
    folded = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Keep only word characters and whitespace.
    folded = re.sub(r'[^\w\s]', '', folded)

    # Drop frequent Polish stop words and very short tokens.
    stop_words = {'i', 'w', 'z', 'na', 'do', 'o', 'od', 'za', 'po', 'przy', 'dla', 'oraz', 'sie', 'to', 'jest', 'ze', 'nie', 'jak', 'czy', 'ale', 'a'}
    kept = sorted(token for token in folded.split() if token not in stop_words and len(token) > 2)

    canonical = ' '.join(kept)
    return hashlib.sha256(canonical.encode()).hexdigest()[:32]
|
|
|
|
|
|
def is_blacklisted_domain(domain: str) -> bool:
    """Return True if *domain* is on the low-value-source blacklist.

    Comparison is case-insensitive and ignores a single leading 'www.'
    prefix so that 'www.sport.pl' matches the 'sport.pl' entry.
    """
    domain = domain.lower()
    # FIX: strip only the leading 'www.' — the previous replace('www.', '')
    # also mangled domains that merely contained 'www.' mid-string.
    if domain.startswith('www.'):
        domain = domain[4:]
    return domain in BLACKLISTED_DOMAINS
|
|
|
|
|
|
def get_domain_bonus(domain: str) -> int:
    """Return the scoring bonus for a preferred domain (0 if not preferred).

    Matching is case-insensitive, ignores a leading 'www.' prefix, tries an
    exact match first, then lets subdomains inherit the parent's bonus
    (e.g. 'biznes.trojmiasto.pl' gets the 'trojmiasto.pl' bonus).
    """
    domain = domain.lower()
    # FIX: strip only the leading 'www.' — replace('www.', '') would also
    # corrupt hostnames containing 'www.' in the middle.
    if domain.startswith('www.'):
        domain = domain[4:]
    # Check exact match first.
    if domain in PREFERRED_DOMAINS:
        return PREFERRED_DOMAINS[domain]
    # Subdomain match. FIX: require a dot boundary — a bare endswith()
    # would also credit unrelated domains like 'xtrojmiasto.pl'.
    # First match wins, so insertion order of PREFERRED_DOMAINS matters.
    for pref_domain, bonus in PREFERRED_DOMAINS.items():
        if domain.endswith('.' + pref_domain):
            return bonus
    return 0
|
|
|
|
|
|
def calculate_keyword_score(title: str, description: str = '') -> dict:
    """Score how relevant a title/description pair is to ZOPK topics.

    Matching is a case-insensitive substring check of every keyword
    against the combined text. Scoring cascade (first rule that fires):
    critical hit → 5, strong hit → 4, weak hit plus a location → 3,
    two or more weak hits → 3, one weak hit → 2, location only → 1,
    nothing → 0.

    Returns:
        dict with:
        - score: 0-5 (0 = no match, 5 = critical keyword)
        - matches: per-tier lists of matched keywords
        - reason: human-readable explanation (Polish, shown in the UI)
        - total_matches: total number of matched keywords across tiers
    """
    text = f"{title} {description}".lower()

    # Collect hits per tier; list order is preserved so the first match
    # of a tier can be quoted in the reason string.
    tiers = (
        ('critical', ZOPK_KEYWORDS_CRITICAL),
        ('strong', ZOPK_KEYWORDS_STRONG),
        ('weak', ZOPK_KEYWORDS_WEAK),
        ('locations', ZOPK_LOCATIONS),
    )
    matches = {
        tier_name: [kw for kw in keywords if kw.lower() in text]
        for tier_name, keywords in tiers
    }

    # Scoring cascade: highest-priority rule that applies wins.
    if matches['critical']:
        score, reason = 5, f"Trafienie krytyczne: {matches['critical'][0]}"
    elif matches['strong']:
        score, reason = 4, f"Mocne powiązanie: {matches['strong'][0]}"
    elif matches['weak'] and matches['locations']:
        # A weak keyword anchored to a local place name → medium score.
        score, reason = 3, f"Słabe + lokalizacja: {matches['weak'][0]} + {matches['locations'][0]}"
    elif len(matches['weak']) >= 2:
        # Several weak signals together → medium score.
        score, reason = 3, f"Wiele słabych: {', '.join(matches['weak'][:2])}"
    elif matches['weak']:
        score, reason = 2, f"Tylko słabe: {matches['weak'][0]}"
    elif matches['locations']:
        # A location alone, with no industry keyword.
        score, reason = 1, f"Tylko lokalizacja: {matches['locations'][0]}"
    else:
        score, reason = 0, "Brak trafień słów kluczowych"

    return {
        'score': score,
        'matches': matches,
        'reason': reason,
        'total_matches': sum(len(hits) for hits in matches.values())
    }
|
|
|
|
|
|
def is_zopk_relevant(title: str, description: str = '') -> bool:
    """Boolean convenience wrapper kept for legacy callers.

    True when calculate_keyword_score() rates the text at 3 or higher
    (i.e. at least a weak keyword combined with a location, or stronger).
    """
    return calculate_keyword_score(title, description)['score'] >= 3
|
|
|
|
|
|
class ZOPKNewsService:
|
|
"""
|
|
Multi-source news search service with cross-verification and AI pre-filtering.
|
|
|
|
NOWY PIPELINE (2026-01):
|
|
1. Wyszukiwanie: wiele precyzyjnych zapytań Brave + RSS
|
|
2. Pre-filtrowanie: blacklista domen + słowa kluczowe
|
|
3. Ocena AI: PRZED zapisem do bazy (tylko 3+★)
|
|
4. Zapis: tylko wysokiej jakości artykuły
|
|
"""
|
|
|
|
    def __init__(self, db_session, brave_api_key: Optional[str] = None, enable_ai_prefilter: bool = True):
        """Create the service.

        Args:
            db_session: Active database session used when persisting items.
            brave_api_key: Brave Search API key; falls back to the
                BRAVE_API_KEY environment variable when not provided.
            enable_ai_prefilter: When True, articles are scored by Gemini
                before being written to the database.
        """
        self.db = db_session
        # Falsy values (None, '') fall through to the environment variable;
        # if neither is set, Brave search is skipped entirely.
        self.brave_api_key = brave_api_key or os.getenv('BRAVE_API_KEY')
        self.enable_ai_prefilter = enable_ai_prefilter
        # Lazily created by _get_gemini(); stays None until first successful load.
        self._gemini_service = None
|
|
|
|
    def _get_gemini(self):
        """Lazily import and cache the Gemini service.

        Returns:
            The cached Gemini service instance, or None when the import or
            initialization failed. Because the cache sentinel stays None on
            failure, the load is retried on every subsequent call.
        """
        if self._gemini_service is None:
            try:
                # Imported here (not at module top) to avoid a hard dependency
                # at import time when the AI pre-filter is disabled.
                from gemini_service import get_gemini_service
                self._gemini_service = get_gemini_service()
            except Exception as e:
                logger.error(f"Failed to load Gemini: {e}")
        return self._gemini_service
|
|
|
|
    def search_all_sources(self, query: str = None, user_id: int = None, progress_callback=None) -> Dict:
        """
        Search all sources with IMPROVED PIPELINE:
        1. Multiple precise Brave queries
        2. Pre-filter by domain blacklist and keywords
        3. AI evaluation BEFORE saving (reject 1-2★)
        4. Save only quality items (3+★)

        Args:
            query: Deprecated, ignored. Uses BRAVE_QUERIES instead.
            user_id: User ID for tracking AI usage
            progress_callback: Optional callback(phase, message, current, total) for SSE streaming

        Returns:
            Dict with search results, statistics, and detailed process log

        NOTE(review): items are NewsItem objects up to _cross_verify(); after
        that step the code indexes them as dicts (item['title']), so
        _cross_verify presumably converts NewsItem → dict — confirm.
        evaluate_news_relevance is not defined in this view; presumably
        defined elsewhere in this module — confirm.
        """
        # NOTE(review): 'time' is already imported at module level; this
        # local import is redundant but harmless.
        import time
        start_time = time.time()

        all_items: List[NewsItem] = []
        # Running counters for every pipeline stage; returned to the caller
        # and echoed into the process log messages below.
        source_stats = {
            'brave_queries': 0,
            'brave_results': 0,
            'rss_results': 0,
            'blacklisted': 0,
            'keyword_filtered': 0,
            'ai_rejected': 0,
            'ai_approved': 0
        }

        # Process log for frontend progress display
        process_log = []
        auto_approved_articles = []  # Track articles auto-approved (3+★)
        ai_rejected_articles = []  # Track articles rejected by AI (1-2★)

        # 1. BRAVE SEARCH - Multiple precise queries
        process_log.append({
            'phase': 'search',
            'step': 'brave_start',
            'message': f'Rozpoczynam wyszukiwanie Brave ({len(BRAVE_QUERIES)} zapytań)...',
            'count': len(BRAVE_QUERIES)
        })

        if self.brave_api_key:
            for i, query_config in enumerate(BRAVE_QUERIES):
                brave_items = self._search_brave_single(query_config['query'])
                source_stats['brave_queries'] += 1
                source_stats['brave_results'] += len(brave_items)
                all_items.extend(brave_items)
                logger.info(f"Brave '{query_config['description']}': {len(brave_items)} items")

                process_log.append({
                    'phase': 'search',
                    'step': f'brave_{i+1}',
                    'message': f"Brave: {query_config['description']}",
                    'count': len(brave_items)
                })

                if progress_callback:
                    progress_callback('search', f"Brave: {query_config['description']} ({len(brave_items)})", i + 1, len(BRAVE_QUERIES))

                # Rate limit: Brave free tier ~1 req/s
                if i < len(BRAVE_QUERIES) - 1:
                    time.sleep(1.1)
        else:
            # No API key — record the skip so the frontend can explain it.
            process_log.append({
                'phase': 'search',
                'step': 'brave_skip',
                'message': 'Brave API niedostępne - pominięto',
                'count': 0
            })

        process_log.append({
            'phase': 'search',
            'step': 'brave_done',
            'message': f'Brave: znaleziono {source_stats["brave_results"]} artykułów',
            'count': source_stats['brave_results']
        })

        # 2. RSS Feeds
        process_log.append({
            'phase': 'search',
            'step': 'rss_start',
            'message': f'Przeszukuję {len(RSS_SOURCES)} źródeł RSS...',
            'count': len(RSS_SOURCES)
        })

        for source_id, source_config in RSS_SOURCES.items():
            rss_items = self._fetch_rss(source_id, source_config)
            all_items.extend(rss_items)
            source_stats['rss_results'] += len(rss_items)

            # Only feeds that actually produced items are logged.
            if rss_items:
                process_log.append({
                    'phase': 'search',
                    'step': f'rss_{source_id}',
                    'message': f"RSS: {source_config['name']}",
                    'count': len(rss_items)
                })

        process_log.append({
            'phase': 'search',
            'step': 'rss_done',
            'message': f'RSS: znaleziono {source_stats["rss_results"]} artykułów',
            'count': source_stats['rss_results']
        })

        if progress_callback:
            progress_callback('search', f'RSS: {source_stats["rss_results"]} wyników', 0, 0)

        logger.info(f"Total raw items: {len(all_items)}")

        total_raw = len(all_items)
        process_log.append({
            'phase': 'search',
            'step': 'search_complete',
            'message': f'📥 Łącznie pobrano: {total_raw} artykułów',
            'count': total_raw
        })

        # 3. PRE-FILTER: Domain blacklist
        process_log.append({
            'phase': 'filter',
            'step': 'blacklist_start',
            'message': 'Filtrowanie: sprawdzam blacklistę domen...',
            'count': 0
        })

        filtered_items = []
        # Collected for diagnostics only; not surfaced in the return value.
        blacklisted_domains_found = set()
        for item in all_items:
            if is_blacklisted_domain(item.domain):
                source_stats['blacklisted'] += 1
                blacklisted_domains_found.add(item.domain)
                logger.debug(f"Blacklisted domain: {item.domain}")
                continue
            filtered_items.append(item)

        logger.info(f"After blacklist filter: {len(filtered_items)} (removed {source_stats['blacklisted']})")

        process_log.append({
            'phase': 'filter',
            'step': 'blacklist_done',
            'message': f'🚫 Blacklist: usunięto {source_stats["blacklisted"]} artykułów (sport, plotki, lifestyle)',
            'count': source_stats['blacklisted']
        })

        # 4. PRE-FILTER: Keyword score (minimum 2)
        process_log.append({
            'phase': 'filter',
            'step': 'keywords_start',
            'message': 'Filtrowanie: analiza słów kluczowych ZOPK...',
            'count': 0
        })

        keyword_filtered = []
        for item in filtered_items:
            kw_result = calculate_keyword_score(item.title, item.description)
            if kw_result['score'] >= 2:  # At least weak relevance
                # Attach the score to the item (NewsItem is not slotted,
                # so dynamic attributes are allowed).
                item.keyword_score = kw_result['score']
                item.keyword_reason = kw_result['reason']
                keyword_filtered.append(item)
            else:
                source_stats['keyword_filtered'] += 1

        logger.info(f"After keyword filter: {len(keyword_filtered)} (removed {source_stats['keyword_filtered']})")

        process_log.append({
            'phase': 'filter',
            'step': 'keywords_done',
            'message': f'🔑 Keywords: usunięto {source_stats["keyword_filtered"]} (brak słów kluczowych ZOPK)',
            'count': source_stats['keyword_filtered']
        })

        process_log.append({
            'phase': 'filter',
            'step': 'filter_complete',
            'message': f'✅ Po filtrowaniu: {len(keyword_filtered)} artykułów do analizy AI',
            'count': len(keyword_filtered)
        })

        # 5. Cross-verify and deduplicate
        verified_items = self._cross_verify(keyword_filtered)
        logger.info(f"After deduplication: {len(verified_items)} unique items")

        process_log.append({
            'phase': 'filter',
            'step': 'dedup_done',
            'message': f'🔄 Deduplikacja: {len(verified_items)} unikalnych artykułów',
            'count': len(verified_items)
        })

        if progress_callback:
            progress_callback('filter', f'Po filtrach: {len(verified_items)} artykułów do AI', 0, 0)

        # 6. AI EVALUATION (before saving) - only if enabled
        sent_to_ai = len(verified_items)  # Track before AI modifies the list

        if self.enable_ai_prefilter and self._get_gemini():
            process_log.append({
                'phase': 'ai',
                'step': 'ai_start',
                'message': f'🤖 AI (Gemini): rozpoczynam ocenę {len(verified_items)} artykułów...',
                'count': len(verified_items)
            })

            ai_approved = []
            ai_evaluated_count = 0

            for item in verified_items:
                # NOTE(review): items are indexed as dicts from here on —
                # see the docstring note about _cross_verify's return shape.
                ai_result = evaluate_news_relevance(
                    {
                        'title': item['title'],
                        'description': item['description'],
                        'source_name': item['source_name'],
                        'published_at': item.get('published_at')
                    },
                    self._get_gemini(),
                    user_id=user_id
                )

                ai_evaluated_count += 1

                if progress_callback:
                    progress_callback('ai', f'AI ocenia: {item["title"][:50]}...', ai_evaluated_count, sent_to_ai)

                if ai_result.get('evaluated'):
                    ai_score = ai_result.get('score', 0)
                    if ai_score >= 3:
                        # Good score - save it
                        item['ai_score'] = ai_score
                        item['ai_reason'] = ai_result.get('reason', '')
                        item['ai_relevant'] = True
                        ai_approved.append(item)
                        source_stats['ai_approved'] += 1

                        # Track for frontend display
                        auto_approved_articles.append({
                            'title': item['title'][:80] + ('...' if len(item['title']) > 80 else ''),
                            'score': ai_score,
                            'source': item.get('source_name', item.get('source_domain', ''))
                        })

                        logger.debug(f"AI approved ({ai_score}★): {item['title'][:50]}")
                    else:
                        # Low score - reject before saving
                        source_stats['ai_rejected'] += 1
                        ai_rejected_articles.append({
                            'title': item['title'][:80] + ('...' if len(item['title']) > 80 else ''),
                            'score': ai_score,
                            'source': item.get('source_name', item.get('source_domain', ''))
                        })
                        logger.debug(f"AI rejected ({ai_score}★): {item['title'][:50]}")
                else:
                    # AI evaluation failed - save as pending for manual review
                    # (counted as 'approved' because the item is still kept).
                    item['ai_score'] = None
                    item['ai_reason'] = ai_result.get('reason', 'AI evaluation failed')
                    item['ai_relevant'] = None
                    ai_approved.append(item)
                    source_stats['ai_approved'] += 1

            verified_items = ai_approved
            logger.info(f"After AI filter: {len(verified_items)} approved, {source_stats['ai_rejected']} rejected")

            process_log.append({
                'phase': 'ai',
                'step': 'ai_done',
                'message': f'🤖 AI: oceniono {ai_evaluated_count}, zaakceptowano {source_stats["ai_approved"]} (3+★), odrzucono {source_stats["ai_rejected"]}',
                'count': source_stats['ai_approved']
            })
        else:
            logger.info("AI pre-filter disabled or Gemini unavailable")
            process_log.append({
                'phase': 'ai',
                'step': 'ai_skip',
                'message': '🤖 AI: wyłączony lub niedostępny',
                'count': 0
            })

        # 7. Save to database (only quality items)
        process_log.append({
            'phase': 'save',
            'step': 'save_start',
            'message': f'💾 Zapisuję {len(verified_items)} artykułów do bazy...',
            'count': len(verified_items)
        })

        saved_count, updated_count = self._save_to_database(verified_items)

        process_log.append({
            'phase': 'save',
            'step': 'save_done',
            'message': f'💾 Zapisano: {saved_count} nowych, {updated_count} zaktualizowanych',
            'count': saved_count + updated_count
        })

        if progress_callback:
            progress_callback('save', f'Zapisano {saved_count} nowych artykułów', saved_count, saved_count)

        # Final summary
        # Note: score >= 3 triggers auto-approve (verified 2026-01-15)
        auto_approved_count = sum(1 for item in verified_items if item.get('auto_approve', False) or (item.get('ai_score') and item['ai_score'] >= 3))

        process_log.append({
            'phase': 'complete',
            'step': 'done',
            'message': f'✅ Zakończono! {saved_count} nowych artykułów w bazie wiedzy.',
            'count': saved_count
        })

        processing_time = time.time() - start_time

        return {
            'total_found': source_stats['brave_results'] + source_stats['rss_results'],
            'blacklisted': source_stats['blacklisted'],
            'keyword_filtered': source_stats['keyword_filtered'],
            'sent_to_ai': sent_to_ai,
            'ai_rejected': source_stats['ai_rejected'],
            'ai_approved': source_stats['ai_approved'],
            'unique_items': len(verified_items),
            'saved_new': saved_count,
            'updated_existing': updated_count,
            'duplicates': updated_count,  # Updated = duplicates that existed
            'source_stats': source_stats,
            'auto_approved': auto_approved_count,
            'process_log': process_log,
            'auto_approved_articles': auto_approved_articles,
            'ai_rejected_articles': ai_rejected_articles,
            'processing_time': processing_time,
            'knowledge_entities_created': saved_count  # Same as saved_new for now
        }
|
|
|
|
@staticmethod
|
|
def _decode_brave_image_url(proxy_url: Optional[str]) -> Optional[str]:
|
|
"""Decode Brave Search proxy image URL to original source URL.
|
|
|
|
Brave proxy URLs encode the original URL as base64 after '/g:ce/'.
|
|
Example: https://imgs.search.brave.com/.../g:ce/aHR0cHM6Ly9... → https://...
|
|
"""
|
|
if not proxy_url or 'imgs.search.brave.com' not in proxy_url:
|
|
return proxy_url
|
|
try:
|
|
# Extract base64 part after /g:ce/
|
|
match = re.search(r'/g:ce/(.+)$', proxy_url)
|
|
if not match:
|
|
return proxy_url
|
|
encoded = match.group(1)
|
|
# Brave uses URL-safe base64 with path separators as line breaks
|
|
encoded = encoded.replace('/', '')
|
|
# Add padding
|
|
padding = 4 - len(encoded) % 4
|
|
if padding != 4:
|
|
encoded += '=' * padding
|
|
decoded = base64.urlsafe_b64decode(encoded).decode('utf-8', errors='ignore')
|
|
if decoded.startswith('http'):
|
|
return decoded
|
|
except Exception:
|
|
pass
|
|
return proxy_url
|
|
|
|
    def _search_brave_single(self, query: str) -> List[NewsItem]:
        """Search the Brave News API with a single query, retrying on HTTP 429.

        Args:
            query: Search phrase sent verbatim to Brave.

        Returns:
            List of NewsItem parsed from the API response; empty list when
            no API key is configured, on non-200 responses, or on errors.
        """
        # Without an API key there is nothing to do — fail quietly.
        if not self.brave_api_key:
            return []

        items = []
        max_retries = 2  # up to 3 attempts total (initial + 2 retries)

        for attempt in range(max_retries + 1):
            try:
                headers = {
                    'Accept': 'application/json',
                    'X-Subscription-Token': self.brave_api_key
                }
                # 'freshness': 'pw' = past week; results restricted to Poland/Polish.
                params = {
                    'q': query,
                    'count': 10,
                    'freshness': 'pw',
                    'country': 'pl',
                    'search_lang': 'pl'
                }

                response = requests.get(
                    'https://api.search.brave.com/res/v1/news/search',
                    headers=headers,
                    params=params,
                    timeout=30
                )

                if response.status_code == 200:
                    results = response.json().get('results', [])
                    for item in results:
                        # Skip entries without a URL — they cannot be stored/deduped.
                        if item.get('url'):
                            items.append(NewsItem(
                                title=item.get('title', 'Bez tytułu'),
                                url=item['url'],
                                description=item.get('description', ''),
                                source_name=item.get('source', ''),
                                source_type='brave',
                                # source_id embeds the query so cross-verification
                                # treats different Brave queries as distinct sources.
                                source_id=f'brave_{query[:20]}',
                                # Brave does not return a publish date here; use "now".
                                published_at=datetime.now(),
                                # Unwrap Brave's image proxy to the original URL.
                                image_url=self._decode_brave_image_url(item.get('thumbnail', {}).get('src'))
                            ))
                    break  # success
                elif response.status_code == 429:
                    # Rate limited: exponential backoff, then retry.
                    if attempt < max_retries:
                        wait = 1.5 * (2 ** attempt)  # 1.5s, 3s
                        logger.warning(f"Brave API 429 for '{query[:30]}', retry {attempt+1} after {wait}s")
                        time.sleep(wait)
                    else:
                        # Retries exhausted — give up on this query only.
                        logger.error(f"Brave API 429 for '{query[:30]}' after {max_retries} retries, skipping")
                else:
                    # Any other status is not retryable.
                    logger.error(f"Brave API error for '{query[:30]}': {response.status_code}")
                    break

            except Exception as e:
                # Network/JSON errors: log and stop retrying for this query.
                logger.error(f"Brave search error: {e}")
                break

        return items
|
|
|
|
def _search_brave(self, query: str) -> List[NewsItem]:
|
|
"""Legacy method - redirects to new multi-query approach"""
|
|
# Kept for compatibility, but now uses multiple queries
|
|
all_items = []
|
|
for query_config in BRAVE_QUERIES:
|
|
all_items.extend(self._search_brave_single(query_config['query']))
|
|
return all_items
|
|
|
|
    def _fetch_rss(self, source_id: str, config: Dict) -> List[NewsItem]:
        """Fetch and parse a single RSS feed into NewsItem objects.

        Args:
            source_id: Key of the feed (used as NewsItem.source_id).
            config: Feed config dict with 'url', 'name', 'type' and optional
                'keywords' (see RSS_SOURCES at the top of this module).

        Returns:
            Filtered list of NewsItem (at most 30 per feed); empty on error.
        """
        items = []
        try:
            feed = feedparser.parse(config['url'])

            for entry in feed.entries[:30]:  # Limit to 30 per feed
                title = entry.get('title', '')
                # Feeds expose the body as either 'summary' or 'description'.
                description = entry.get('summary', entry.get('description', ''))

                # Filter by keywords if specified (case-insensitive substring
                # match against title + description).
                keywords = config.get('keywords', [])
                if keywords and not any(kw in f"{title} {description}".lower() for kw in keywords):
                    continue

                # Local media feeds carry lots of unrelated news — apply the
                # additional ZOPK relevance check for them.
                if config['type'] == 'local_media' and not is_zopk_relevant(title, description):
                    continue

                # Parse publication date when the feed provides one.
                published_at = None
                if hasattr(entry, 'published_parsed') and entry.published_parsed:
                    published_at = datetime(*entry.published_parsed[:6])

                items.append(NewsItem(
                    title=title,
                    url=entry.get('link', ''),
                    description=description[:500],  # keep stored text bounded
                    source_name=config['name'],
                    source_type='rss_' + config['type'],
                    source_id=source_id,
                    published_at=published_at,
                    image_url=self._extract_image_from_entry(entry)
                ))

        except Exception as e:
            # A broken feed must not abort the whole pipeline.
            logger.error(f"RSS fetch error for {source_id}: {e}")

        return items
|
|
|
|
def _extract_image_from_entry(self, entry) -> Optional[str]:
|
|
"""Extract image URL from RSS entry"""
|
|
# Try media:thumbnail
|
|
if hasattr(entry, 'media_thumbnail') and entry.media_thumbnail:
|
|
return entry.media_thumbnail[0].get('url')
|
|
|
|
# Try media:content
|
|
if hasattr(entry, 'media_content') and entry.media_content:
|
|
for media in entry.media_content:
|
|
if media.get('type', '').startswith('image/'):
|
|
return media.get('url')
|
|
|
|
# Try enclosure
|
|
if hasattr(entry, 'enclosures') and entry.enclosures:
|
|
for enc in entry.enclosures:
|
|
if enc.get('type', '').startswith('image/'):
|
|
return enc.get('href')
|
|
|
|
return None
|
|
|
|
def _cross_verify(self, items: List[NewsItem]) -> List[Dict]:
|
|
"""
|
|
Cross-verify items from multiple sources.
|
|
|
|
Groups items by title_hash to find the same story from different sources.
|
|
Increases confidence_score based on number of sources.
|
|
"""
|
|
# Group by title_hash (fuzzy match)
|
|
title_groups: Dict[str, List[NewsItem]] = {}
|
|
for item in items:
|
|
title_hash = item.title_hash
|
|
if title_hash not in title_groups:
|
|
title_groups[title_hash] = []
|
|
title_groups[title_hash].append(item)
|
|
|
|
# Also track URL hashes to avoid exact duplicates
|
|
seen_urls = set()
|
|
verified_items = []
|
|
|
|
for title_hash, group in title_groups.items():
|
|
# Get unique sources
|
|
unique_sources = list(set(item.source_id for item in group))
|
|
source_count = len(unique_sources)
|
|
|
|
# Use the first item as base (prefer Brave for better metadata)
|
|
base_item = sorted(group, key=lambda x: x.source_type != 'brave')[0]
|
|
|
|
if base_item.url_hash in seen_urls:
|
|
continue
|
|
seen_urls.add(base_item.url_hash)
|
|
|
|
# Calculate confidence
|
|
confidence_score = min(5, source_count + 1) # 1-5 scale
|
|
auto_approve = source_count >= 3
|
|
|
|
verified_items.append({
|
|
'title': base_item.title,
|
|
'url': base_item.url,
|
|
'url_hash': base_item.url_hash,
|
|
'title_hash': title_hash,
|
|
'description': base_item.description,
|
|
'source_name': base_item.source_name,
|
|
'source_domain': base_item.domain,
|
|
'source_type': base_item.source_type,
|
|
'published_at': base_item.published_at,
|
|
'image_url': base_item.image_url,
|
|
'confidence_score': confidence_score,
|
|
'source_count': source_count,
|
|
'sources_list': unique_sources,
|
|
'auto_approve': auto_approve
|
|
})
|
|
|
|
return verified_items
|
|
|
|
    # Keywords that indicate nuclear/PEJ content — used for auto-tagging project_id
    # by _detect_nuclear_project_id. Matched case-insensitively as substrings
    # against "title + description"; entries like 'pej ' / ' pej' use spaces to
    # avoid matching the letters inside longer words.
    NUCLEAR_KEYWORDS = [
        'elektrowni', 'jądrow', 'atomow', 'pej ', ' pej',
        'westinghouse', 'bechtel', 'ap1000', 'ap 1000',
        'lubiatowo', 'kopalino', 'choczewo',
        'reaktor jądr', 'smr ', 'bwrx', 'ge hitachi',
        'orlen synthos', 'green energy',
    ]
|
|
|
|
    def _detect_nuclear_project_id(self, title: str, description: str = '') -> Optional[int]:
        """Return nuclear project_id if content matches nuclear keywords, else None.

        Also returns None when the 'nuclear-plant' project row does not exist.
        """
        # Case-insensitive substring match against title + description.
        text = (title + ' ' + (description or '')).lower()
        for kw in self.NUCLEAR_KEYWORDS:
            if kw in text:
                # Lazily resolve the project id on first match and cache it on
                # the instance; a missing project row is cached as None too,
                # so the DB is queried at most once per service instance.
                if not hasattr(self, '_nuclear_project_id'):
                    from database import ZOPKProject
                    proj = self.db.query(ZOPKProject.id).filter(
                        ZOPKProject.slug == 'nuclear-plant'
                    ).first()
                    self._nuclear_project_id = proj[0] if proj else None
                return self._nuclear_project_id
        return None
|
|
|
|
    def _save_to_database(self, items: List[Dict]) -> Tuple[int, int]:
        """
        Save verified items to the database.

        New URLs become ZOPKNews rows; already-known URLs (matched by
        url_hash) have their source list, source count and confidence score
        merged instead of being duplicated.

        Returns:
            Tuple of (new_count, updated_count)
        """
        from database import ZOPKNews

        new_count = 0
        updated_count = 0

        for item in items:
            # Check if URL already exists — url_hash is the dedupe key.
            existing = self.db.query(ZOPKNews).filter(
                ZOPKNews.url_hash == item['url_hash']
            ).first()

            if existing:
                # Update source count and confidence if new sources were found.
                existing_sources = existing.sources_list or []
                new_sources = [s for s in item['sources_list'] if s not in existing_sources]

                if new_sources:
                    existing.sources_list = existing_sources + new_sources
                    existing.source_count = len(existing.sources_list)
                    # Mirrors _cross_verify: source_count + 1, capped at 5.
                    existing.confidence_score = min(5, existing.source_count + 1)

                    # Auto-approve once 3+ independent sources confirm the story.
                    if existing.source_count >= 3 and existing.status == 'pending':
                        existing.status = 'auto_approved'
                        existing.is_auto_verified = True

                updated_count += 1
            else:
                # Create new entry.
                # Determine status based on AI score (if available).
                # Note: score >= 3 triggers auto-approve (verified 2026-01-15)
                ai_score = item.get('ai_score')
                if ai_score and ai_score >= 3:
                    status = 'auto_approved'  # AI score 3+ = auto-approve
                elif item.get('auto_approve'):
                    status = 'auto_approved'  # Multiple sources = auto-approve
                else:
                    status = 'pending'

                # Auto-detect project tagging (nuclear, etc.).
                detected_project_id = self._detect_nuclear_project_id(
                    item['title'], item.get('description', '')
                )

                news = ZOPKNews(
                    title=item['title'],
                    url=item['url'],
                    url_hash=item['url_hash'],
                    title_hash=item['title_hash'],
                    description=item['description'],
                    project_id=detected_project_id,
                    source_name=item['source_name'],
                    source_domain=item['source_domain'],
                    source_type=item['source_type'],
                    published_at=item['published_at'],
                    image_url=item.get('image_url'),
                    confidence_score=item['confidence_score'],
                    source_count=item['source_count'],
                    sources_list=item['sources_list'],
                    is_auto_verified=item.get('auto_approve', False) or (ai_score and ai_score >= 3),
                    status=status,
                    # AI evaluation results from pre-filtering
                    ai_relevant=item.get('ai_relevant'),
                    ai_relevance_score=ai_score,
                    ai_evaluation_reason=item.get('ai_reason', '')[:255] if item.get('ai_reason') else None,
                    ai_evaluated_at=datetime.now() if ai_score else None,
                    ai_model='gemini-3-flash-preview' if ai_score else None
                )
                self.db.add(news)
                new_count += 1

        # Single commit for the whole batch.
        self.db.commit()
        return new_count, updated_count
|
|
|
|
|
|
def search_zopk_news(db_session, query: str = None, user_id: int = None, enable_ai_prefilter: bool = True) -> Dict:
    """Run the full multi-source ZOPK news pipeline and return its stats.

    Pipeline (2026-01 revision):
    - several targeted Brave queries instead of a single broad one
    - domain blacklist (sport, gossip, lifestyle)
    - keyword pre-filtering (min. score 2)
    - AI evaluation BEFORE saving (rejects 1-2 star articles)
    - only 3+ star articles reach the database

    Args:
        db_session: SQLAlchemy session.
        query: Deprecated, ignored.
        user_id: User ID for tracking AI usage.
        enable_ai_prefilter: If True, evaluate with AI before saving (default: True).

    Usage:
        from zopk_news_service import search_zopk_news
        results = search_zopk_news(db, user_id=current_user.id)
    """
    return ZOPKNewsService(
        db_session,
        enable_ai_prefilter=enable_ai_prefilter,
    ).search_all_sources(user_id=user_id)
|
|
|
|
|
|
# ============================================================
|
|
# AI RELEVANCE EVALUATION (GEMINI)
|
|
# ============================================================
|
|
|
|
ZOPK_AI_EVALUATION_PROMPT = """Jesteś ekspertem ds. analizy wiadomości. Oceń, czy poniższy artykuł/news dotyczy projektu **Zielony Okręg Przemysłowy Kaszubia (ZOPK)** lub związanych z nim tematów.
|
|
|
|
**ZOPK obejmuje:**
|
|
1. Morską energetykę wiatrową na Bałtyku (offshore wind, Baltic Power, Baltica, Bałtyk 1/2/3)
|
|
2. Elektrownię jądrową w Lubiatowie-Kopalino (Choczewo, PEJ, atom Pomorze)
|
|
3. Inwestycję Kongsberg w Rumi (przemysł obronny, zbrojeniówka Pomorze)
|
|
4. Centra danych i laboratoria wodorowe
|
|
5. Rozwój przemysłowy Kaszub (Wejherowo, Rumia, Gdynia, Kościerzyna)
|
|
6. Kluczowe osoby: Maciej Samsonowicz (koordynator ZOPK), minister Kosiniak-Kamysz
|
|
|
|
**KLUCZOWE PROJEKTY INFRASTRUKTURALNE (5★ gdy w tytule!):**
|
|
7. Via Pomerania - droga ekspresowa Ustka-Bydgoszcz (każda wzmianka = 5★)
|
|
8. S6 Koszalin-Słupsk - autostrada Pomorze (każda wzmianka = 4-5★)
|
|
9. Droga Czerwona - połączenie z Portem Gdynia (każda wzmianka = 4-5★)
|
|
10. Pakt dla Bezpieczeństwa Polski - Pomorze Środkowe (każda wzmianka = 5★)
|
|
11. Deklaracja Bałtycka - bezpieczeństwo Bałtyku (każda wzmianka = 4-5★)
|
|
|
|
**LOKALNE ORGANIZACJE BIZNESOWE (4-5★):**
|
|
12. Izba Przedsiębiorców NORDA, Norda Biznes, Akademia Biznesu NORDA
|
|
13. Regionalne izby gospodarcze Pomorza (Wejherowo, Gdynia, Rumia)
|
|
|
|
**Artykuł do oceny:**
|
|
Tytuł: {title}
|
|
Opis: {description}
|
|
Źródło: {source}
|
|
Data: {date}
|
|
|
|
**Twoje zadanie:**
|
|
1. Oceń czy artykuł dotyczy ZOPK lub powiązanych tematów
|
|
2. Przyznaj ocenę od 1 do 5 gwiazdek:
|
|
- ⭐ 1 = Bardzo słabo powiązany (luźna styczność z regionem/przemysłem)
|
|
- ⭐⭐ 2 = Słabo powiązany (ogólne wiadomości branżowe)
|
|
- ⭐⭐⭐ 3 = Średnio powiązany (dotyczy branży ZOPK, ale nie bezpośrednio projektu)
|
|
- ⭐⭐⭐⭐ 4 = Mocno powiązany (bezpośrednio dotyczy inwestycji lub kluczowych firm ZOPK)
|
|
- ⭐⭐⭐⭐⭐ 5 = Doskonale pasuje (główny temat to ZOPK, Kongsberg, offshore Baltic, elektrownia Choczewo)
|
|
|
|
3. Odpowiedz TYLKO w formacie JSON (bez żadnego innego tekstu):
|
|
|
|
{{"relevant": true/false, "score": 1-5, "reason": "krótkie uzasadnienie po polsku (max 100 znaków)"}}
|
|
|
|
Zasady:
|
|
- relevant=true gdy score >= 3
|
|
- relevant=false gdy score < 3
|
|
|
|
Przykłady odpowiedzi:
|
|
{{"relevant": true, "score": 5, "reason": "Bezpośrednio o Via Pomerania"}}
|
|
{{"relevant": true, "score": 5, "reason": "Pakt Bezpieczeństwa Pomorze Środkowe"}}
|
|
{{"relevant": true, "score": 5, "reason": "Inwestycja Kongsberg w Rumi"}}
|
|
{{"relevant": true, "score": 5, "reason": "Akademia Biznesu NORDA / Izba NORDA"}}
|
|
{{"relevant": true, "score": 4, "reason": "Droga Czerwona Port Gdynia"}}
|
|
{{"relevant": true, "score": 4, "reason": "Dotyczy farm wiatrowych Baltic Power"}}
|
|
{{"relevant": true, "score": 4, "reason": "S6 Koszalin-Słupsk otwarcie"}}
|
|
{{"relevant": true, "score": 3, "reason": "Ogólne informacje o offshore wind w Polsce"}}
|
|
{{"relevant": false, "score": 2, "reason": "Artykuł o energetyce, ale nie dotyczy Bałtyku"}}
|
|
{{"relevant": false, "score": 1, "reason": "Turystyka/wydarzenia lokalne bez przemysłu"}}
|
|
{{"relevant": false, "score": 1, "reason": "Polityka ogólnopolska bez związku z Pomorzem"}}"""
|
|
|
|
|
|
def evaluate_news_relevance(news_item, gemini_service=None, user_id: int = None) -> Dict:
    """
    Evaluate a single news item for ZOPK relevance using Gemini AI.

    Args:
        news_item: ZOPKNews object or dict with title, description, source_name, published_at
        gemini_service: Optional GeminiService instance (uses global if not provided)
        user_id: ID of the user triggering the evaluation (for cost tracking)

    Returns:
        Dict with keys: relevant (bool or None), score (1-5 or None),
        reason (str), evaluated (bool). 'evaluated' is False whenever the
        service is unavailable or the response could not be parsed.
    """
    import json

    # Resolve the Gemini service lazily so callers without one still work.
    if gemini_service is None:
        try:
            from gemini_service import get_gemini_service
            gemini_service = get_gemini_service()
        except Exception as e:
            logger.error(f"Failed to get Gemini service: {e}")
            return {'relevant': None, 'reason': 'Gemini service unavailable', 'evaluated': False}

    # get_gemini_service() may itself return None when not configured.
    if gemini_service is None:
        return {'relevant': None, 'reason': 'Gemini service not initialized', 'evaluated': False}

    # Extract fields — accepts either an ORM object or a plain dict.
    if hasattr(news_item, 'title'):
        title = news_item.title or ''
        description = news_item.description or ''
        source = news_item.source_name or news_item.source_domain or ''
        date = news_item.published_at.strftime('%Y-%m-%d') if news_item.published_at else ''
    else:
        title = news_item.get('title', '')
        description = news_item.get('description', '')
        source = news_item.get('source_name', '')
        date = news_item.get('published_at', '')

    # Build the prompt; fields are truncated to keep token usage bounded.
    prompt = ZOPK_AI_EVALUATION_PROMPT.format(
        title=title[:500],  # Limit length
        description=description[:1000] if description else 'Brak opisu',
        source=source[:100],
        date=date
    )

    try:
        # Low temperature for consistent, near-deterministic scoring.
        response = gemini_service.generate_text(
            prompt,
            temperature=0.1,
            feature='zopk_news_evaluation',
            user_id=user_id
        )

        # Extract the first flat {...} object from the response; this also
        # copes with the model wrapping JSON in markdown code fences.
        json_match = re.search(r'\{[^{}]*\}', response)
        if json_match:
            result = json.loads(json_match.group())
            # Extract score (1-5), default to 3 if not present.
            score = int(result.get('score', 3))
            score = max(1, min(5, score))  # Clamp to 1-5 range
            return {
                # Fall back to the score>=3 rule when 'relevant' is missing.
                'relevant': bool(result.get('relevant', score >= 3)),
                'score': score,
                'reason': str(result.get('reason', ''))[:255],
                'evaluated': True
            }
        else:
            logger.warning(f"Could not parse Gemini response: {response[:200]}")
            return {'relevant': None, 'score': None, 'reason': 'Invalid AI response format', 'evaluated': False}

    except json.JSONDecodeError as e:
        logger.error(f"JSON decode error: {e}")
        return {'relevant': None, 'score': None, 'reason': f'JSON parse error: {str(e)[:50]}', 'evaluated': False}
    except Exception as e:
        # Covers API errors and a non-numeric 'score' (int() ValueError).
        logger.error(f"Gemini evaluation error: {e}")
        return {'relevant': None, 'score': None, 'reason': f'AI error: {str(e)[:50]}', 'evaluated': False}
|
|
|
|
|
|
def reevaluate_news_without_score(db_session, limit: int = 50, user_id: int = None) -> Dict:
    """
    Re-evaluate news items that have ai_relevant but no ai_relevance_score.
    Used to upgrade old binary evaluations to the new 1-5 star system.

    Args:
        db_session: SQLAlchemy session
        limit: Max number of items to evaluate (to avoid API limits)
        user_id: User triggering the evaluation (for logging)

    Returns:
        Dict with stats: total_evaluated, relevant_count, not_relevant_count,
        errors, plus a human-readable 'message' (Polish).
    """
    from database import ZOPKNews
    from datetime import datetime

    # Get news that have been evaluated (ai_relevant is set) but lack a score.
    news_to_rescore = db_session.query(ZOPKNews).filter(
        ZOPKNews.ai_relevant.isnot(None),  # Already evaluated
        ZOPKNews.ai_relevance_score.is_(None)  # But missing score
    ).order_by(ZOPKNews.created_at.desc()).limit(limit).all()

    if not news_to_rescore:
        return {
            'total_evaluated': 0,
            'relevant_count': 0,
            'not_relevant_count': 0,
            'errors': 0,
            'message': 'Wszystkie newsy mają już ocenę gwiazdkową'
        }

    # Get the Gemini service once and reuse it across all items.
    try:
        from gemini_service import get_gemini_service
        gemini = get_gemini_service()
    except Exception as e:
        return {
            'total_evaluated': 0,
            'relevant_count': 0,
            'not_relevant_count': 0,
            'errors': 1,
            'message': f'Gemini service error: {str(e)}'
        }

    stats = {
        'total_evaluated': 0,
        'relevant_count': 0,
        'not_relevant_count': 0,
        'errors': 0
    }

    for news in news_to_rescore:
        result = evaluate_news_relevance(news, gemini, user_id=user_id)

        if result['evaluated']:
            # Persist the fresh evaluation on the ORM object.
            news.ai_relevant = result['relevant']
            news.ai_relevance_score = result.get('score')  # 1-5 stars
            news.ai_evaluation_reason = result['reason']
            news.ai_evaluated_at = datetime.now()
            news.ai_model = 'gemini-3-flash-preview'

            stats['total_evaluated'] += 1
            if result['relevant']:
                stats['relevant_count'] += 1
            else:
                stats['not_relevant_count'] += 1
        else:
            # Evaluation failure for one item does not stop the batch.
            stats['errors'] += 1
            logger.warning(f"Failed to re-evaluate news {news.id}: {result['reason']}")

    # Commit all changes in one transaction; roll back on DB failure.
    try:
        db_session.commit()
        stats['message'] = f"Przeoceniono {stats['total_evaluated']} newsów: {stats['relevant_count']} pasuje, {stats['not_relevant_count']} nie pasuje"
    except Exception as e:
        db_session.rollback()
        stats['errors'] += 1
        stats['message'] = f'Database error: {str(e)}'

    return stats
|
|
|
|
|
|
def reevaluate_low_score_news(db_session, limit: int = 50, user_id: int = None) -> Dict:
    """
    Re-evaluate news with low AI scores (1-2★) that contain key ZOPK topics.

    This is useful after updating the AI prompt to include new topics like
    Via Pomerania, S6, NORDA, etc. Old articles may have been scored low
    before these topics were added to the prompt.

    Key topics that trigger re-evaluation:
    - Via Pomerania (droga ekspresowa)
    - S6, S7 (autostrady)
    - Droga Czerwona
    - NORDA, Izba Przedsiębiorców
    - Pakt Bezpieczeństwa
    - Deklaracja Bałtycka

    Args:
        db_session: SQLAlchemy session
        limit: Max number of items to evaluate (to avoid API limits)
        user_id: User triggering the evaluation (for logging)

    Returns:
        Dict with stats: total_evaluated, upgraded, downgraded, unchanged,
        errors, plus per-item 'details' and a human-readable 'message'.
    """
    from database import ZOPKNews
    from datetime import datetime
    from sqlalchemy import or_, func

    # Key topics that should trigger re-evaluation (lowercase substrings).
    KEY_TOPICS = [
        'via pomerania', 'via-pomerania',
        'droga czerwona',
        'pakt bezpieczeństwa', 'pakt dla bezpieczeństwa',
        'deklaracja bałtycka',
        'norda biznes', 'izba przedsiębiorców norda', 'akademia biznesu norda',
        's6 koszalin', 's6 słupsk',
        's7 gdańsk', 's7 elbląg',
    ]

    # Build OR-filter: each topic may match the title or the description.
    topic_filters = []
    for topic in KEY_TOPICS:
        topic_filters.append(func.lower(ZOPKNews.title).contains(topic))
        topic_filters.append(func.lower(ZOPKNews.description).contains(topic))

    # Get news with score 1-2 that contain key topics.
    news_to_rescore = db_session.query(ZOPKNews).filter(
        ZOPKNews.ai_relevance_score.isnot(None),  # Has been evaluated
        ZOPKNews.ai_relevance_score <= 2,  # Low score (1-2★)
        or_(*topic_filters)  # Contains key topics
    ).order_by(ZOPKNews.created_at.desc()).limit(limit).all()

    if not news_to_rescore:
        return {
            'total_evaluated': 0,
            'upgraded': 0,
            'downgraded': 0,
            'unchanged': 0,
            'errors': 0,
            'message': 'Brak newsów z niską oceną zawierających kluczowe tematy'
        }

    # Get the Gemini service once and reuse it across all items.
    try:
        from gemini_service import get_gemini_service
        gemini = get_gemini_service()
    except Exception as e:
        return {
            'total_evaluated': 0,
            'upgraded': 0,
            'downgraded': 0,
            'unchanged': 0,
            'errors': 1,
            'message': f'Gemini service error: {str(e)}'
        }

    stats = {
        'total_evaluated': 0,
        'upgraded': 0,
        'downgraded': 0,
        'unchanged': 0,
        'errors': 0,
        'details': []  # Track individual changes
    }

    for news in news_to_rescore:
        old_score = news.ai_relevance_score
        result = evaluate_news_relevance(news, gemini, user_id=user_id)

        if result['evaluated']:
            # Keep the old score if the AI response somehow lacks one.
            new_score = result.get('score', old_score)

            # Update the news item with the fresh evaluation.
            news.ai_relevant = result['relevant']
            news.ai_relevance_score = new_score
            news.ai_evaluation_reason = result['reason']
            news.ai_evaluated_at = datetime.now()
            news.ai_model = 'gemini-3-flash-preview'  # Gemini 3 Flash

            # Track direction of the change.
            stats['total_evaluated'] += 1
            if new_score > old_score:
                stats['upgraded'] += 1
                # If score is now >= 3, auto-approve (even previously rejected).
                if new_score >= 3 and news.status in ('pending', 'rejected'):
                    news.status = 'auto_approved'
                    news.is_auto_verified = True
            elif new_score < old_score:
                stats['downgraded'] += 1
            else:
                stats['unchanged'] += 1

            stats['details'].append({
                'id': news.id,
                'title': news.title[:50],
                'old_score': old_score,
                'new_score': new_score,
                'change': 'upgraded' if new_score > old_score else ('downgraded' if new_score < old_score else 'unchanged')
            })

            logger.info(f"Re-evaluated news {news.id}: {old_score}★ → {new_score}★")
        else:
            # Evaluation failure for one item does not stop the batch.
            stats['errors'] += 1
            logger.warning(f"Failed to re-evaluate news {news.id}: {result['reason']}")

    # Commit all changes in one transaction; roll back on DB failure.
    try:
        db_session.commit()
        stats['message'] = f"Re-ewaluacja {stats['total_evaluated']} newsów: {stats['upgraded']} podwyższono, {stats['downgraded']} obniżono, {stats['unchanged']} bez zmian"
    except Exception as e:
        db_session.rollback()
        stats['errors'] += 1
        stats['message'] = f'Database error: {str(e)}'

    return stats
|
|
|
|
|
|
def evaluate_pending_news(db_session, limit: int = 50, user_id: int = None) -> Dict:
    """
    Evaluate multiple pending news items for ZOPK relevance.

    Only items with status 'pending' that have never been AI-evaluated
    (ai_relevant is NULL) are processed, newest first.

    Args:
        db_session: SQLAlchemy session
        limit: Max number of items to evaluate (to avoid API limits)
        user_id: User triggering the evaluation (for logging)

    Returns:
        Dict with stats: total_evaluated, relevant_count, not_relevant_count,
        errors, plus a human-readable 'message' (Polish).
    """
    from database import ZOPKNews
    from datetime import datetime

    # Get pending news that haven't been AI-evaluated yet.
    pending_news = db_session.query(ZOPKNews).filter(
        ZOPKNews.status == 'pending',
        ZOPKNews.ai_relevant.is_(None)  # Not yet evaluated
    ).order_by(ZOPKNews.created_at.desc()).limit(limit).all()

    if not pending_news:
        return {
            'total_evaluated': 0,
            'relevant_count': 0,
            'not_relevant_count': 0,
            'errors': 0,
            'message': 'Brak newsów do oceny'
        }

    # Get the Gemini service once and reuse it across all items.
    try:
        from gemini_service import get_gemini_service
        gemini = get_gemini_service()
    except Exception as e:
        return {
            'total_evaluated': 0,
            'relevant_count': 0,
            'not_relevant_count': 0,
            'errors': 1,
            'message': f'Gemini service error: {str(e)}'
        }

    stats = {
        'total_evaluated': 0,
        'relevant_count': 0,
        'not_relevant_count': 0,
        'errors': 0
    }

    for news in pending_news:
        result = evaluate_news_relevance(news, gemini, user_id=user_id)

        if result['evaluated']:
            # Persist the evaluation on the ORM object.
            news.ai_relevant = result['relevant']
            news.ai_relevance_score = result.get('score')  # 1-5 stars
            news.ai_evaluation_reason = result['reason']
            news.ai_evaluated_at = datetime.now()
            news.ai_model = 'gemini-3-flash-preview'

            stats['total_evaluated'] += 1
            if result['relevant']:
                stats['relevant_count'] += 1
            else:
                stats['not_relevant_count'] += 1
        else:
            # Evaluation failure for one item does not stop the batch.
            stats['errors'] += 1
            logger.warning(f"Failed to evaluate news {news.id}: {result['reason']}")

    # Commit all changes in one transaction; roll back on DB failure.
    try:
        db_session.commit()
        stats['message'] = f"Oceniono {stats['total_evaluated']} newsów: {stats['relevant_count']} pasuje, {stats['not_relevant_count']} nie pasuje"
    except Exception as e:
        db_session.rollback()
        stats['errors'] += 1
        stats['message'] = f'Database error: {str(e)}'

    return stats
|