feat(zopk): Knowledge Base + NordaGPT integration (FAZY 0-3)
FAZA 0 - Web Scraping: - Migracja 015: pola full_content, scrape_status w zopk_news - zopk_content_scraper.py: scraper z rate limiting i selektorami FAZA 1 - Knowledge Extraction: - zopk_knowledge_service.py: chunking, facts, entities extraction - Endpointy /admin/zopk/knowledge/extract FAZA 2 - Embeddings: - gemini_service.py: generate_embedding(), generate_embeddings_batch() - Model text-embedding-004 (768 dimensions) FAZA 3 - NordaGPT Integration: - nordabiz_chat.py: _is_zopk_query(), _get_zopk_knowledge_context() - System prompt z bazą wiedzy ZOPK - Semantic search w kontekście chatu Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
0f1cf6176a
commit
1b4cd31c41
325
app.py
325
app.py
@ -10849,6 +10849,331 @@ def api_zopk_search_news():
|
||||
db.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ZOPK CONTENT SCRAPING (Knowledge Base Pipeline)
|
||||
# ============================================================
|
||||
|
||||
@app.route('/admin/zopk/news/scrape-stats')
@login_required
def admin_zopk_scrape_stats():
    """
    Return content-scraping statistics as JSON (admin only).

    The response merges the dict produced by ``get_scrape_stats`` with a
    ``success`` flag. Fields include: total_approved, scraped, pending,
    failed, skipped, and ready_for_extraction (scraped but not yet
    processed for knowledge).

    Returns 403 for non-admins and 500 with the error message on failure.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import get_scrape_stats

    session = SessionLocal()
    try:
        payload = {'success': True}
        payload.update(get_scrape_stats(session))
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting scrape stats: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/news/scrape-content', methods=['POST'])
@login_required
def admin_zopk_scrape_content():
    """
    Batch scrape article content from source URLs (admin only).

    Request JSON:
    - limit: int (default 50) - max articles to scrape (clamped to 1..100)
    - force: bool (default false) - re-scrape already scraped articles

    Response JSON:
    - scraped: number of successfully scraped articles
    - failed: number of failures
    - skipped: number of skipped (social media, paywalls, etc.)
    - errors: list of error details
    - scraped_articles: list of scraped article info
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import ZOPKContentScraper

    db = SessionLocal()
    try:
        # silent=True: a missing or malformed JSON body becomes {} instead of
        # Flask raising a 400/415 outside this handler's error handling
        data = request.get_json(silent=True) or {}
        # Clamp to a sane range: at most 100 per request, never less than 1
        limit = max(1, min(data.get('limit', 50), 100))
        force = data.get('force', False)

        scraper = ZOPKContentScraper(db, user_id=current_user.id)
        result = scraper.batch_scrape(limit=limit, force=force)

        return jsonify({
            'success': True,
            'message': f"Scraping zakończony: {result['scraped']} pobrano, "
                       f"{result['failed']} błędów, {result['skipped']} pominięto",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error in batch scrape: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/news/<int:news_id>/scrape', methods=['POST'])
@login_required
def admin_zopk_scrape_single(news_id):
    """
    Scrape content for a single article (admin only).

    Returns the scraped word count and status on success, or a 400 with
    the scraper's error message and status when the scrape fails.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import ZOPKContentScraper

    session = SessionLocal()
    try:
        outcome = ZOPKContentScraper(session, user_id=current_user.id).scrape_article(news_id)

        # Guard clause: report the failure first, success is the fall-through
        if not outcome.success:
            return jsonify({
                'success': False,
                'error': outcome.error,
                'status': outcome.status
            }), 400

        return jsonify({
            'success': True,
            'message': f"Pobrano treść: {outcome.word_count} słów",
            'word_count': outcome.word_count,
            'status': outcome.status
        })

    except Exception as e:
        session.rollback()
        logger.error(f"Error scraping article {news_id}: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ZOPK KNOWLEDGE EXTRACTION (AI-powered)
|
||||
# ============================================================
|
||||
|
||||
@app.route('/admin/zopk/knowledge/stats')
@login_required
def admin_zopk_knowledge_stats():
    """
    Return knowledge-extraction statistics as JSON (admin only).

    The payload merges the dict from ``get_knowledge_stats`` with a
    ``success`` flag. It contains:
    - articles: stats about articles (approved, scraped, extracted)
    - knowledge_base: stats about chunks, facts, entities, relations
    - top_entities: most mentioned entities
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import get_knowledge_stats

    session = SessionLocal()
    try:
        payload = {'success': True}
        payload.update(get_knowledge_stats(session))
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting knowledge stats: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/extract', methods=['POST'])
@login_required
def admin_zopk_knowledge_extract():
    """
    Batch extract knowledge from scraped articles (admin only).

    Request JSON:
    - limit: int (default 50) - max articles to process (clamped to 1..100)

    Response JSON:
    - success/failed counts
    - chunks/facts/entities/relations created
    - errors list
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import ZOPKKnowledgeService

    db = SessionLocal()
    try:
        # silent=True: tolerate a missing/non-JSON body instead of letting
        # Flask abort with 400/415 before our error handling runs
        data = request.get_json(silent=True) or {}
        # Clamp to 1..100 so a negative or zero limit cannot reach the service
        limit = max(1, min(data.get('limit', 50), 100))

        service = ZOPKKnowledgeService(db, user_id=current_user.id)
        result = service.batch_extract(limit=limit)

        return jsonify({
            'success': True,
            'message': f"Ekstrakcja zakończona: {result['success']}/{result['total']} artykułów. "
                       f"Utworzono: {result['chunks_created']} chunks, {result['facts_created']} faktów, "
                       f"{result['entities_created']} encji, {result['relations_created']} relacji.",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error in knowledge extraction: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/extract/<int:news_id>', methods=['POST'])
@login_required
def admin_zopk_knowledge_extract_single(news_id):
    """
    Extract knowledge (chunks/facts/entities/relations) from one article.

    Admin only. Returns per-category creation counts and the processing
    time on success, or a 400 with the extraction error when it fails.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import ZOPKKnowledgeService

    session = SessionLocal()
    try:
        extraction = ZOPKKnowledgeService(session, user_id=current_user.id).extract_from_news(news_id)

        # Guard clause: surface the extraction failure first
        if not extraction.success:
            return jsonify({
                'success': False,
                'error': extraction.error
            }), 400

        return jsonify({
            'success': True,
            'message': f"Wyekstrahowano: {extraction.chunks_created} chunks, "
                       f"{extraction.facts_created} faktów, {extraction.entities_created} encji",
            'chunks_created': extraction.chunks_created,
            'facts_created': extraction.facts_created,
            'entities_created': extraction.entities_created,
            'relations_created': extraction.relations_created,
            'processing_time': extraction.processing_time
        })

    except Exception as e:
        session.rollback()
        logger.error(f"Error extracting from news {news_id}: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/embeddings', methods=['POST'])
@login_required
def admin_zopk_generate_embeddings():
    """
    Generate embeddings for knowledge chunks that don't have them yet.

    Admin only.

    Request JSON:
    - limit: int (default 100) - max chunks to process (clamped to 1..500)
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import generate_chunk_embeddings

    db = SessionLocal()
    try:
        # silent=True: a missing/invalid JSON body becomes {} instead of
        # Flask aborting with 400/415 outside this handler's error path
        data = request.get_json(silent=True) or {}
        # Clamp to 1..500 so a negative or zero limit never reaches the service
        limit = max(1, min(data.get('limit', 100), 500))

        result = generate_chunk_embeddings(db, limit=limit, user_id=current_user.id)

        return jsonify({
            'success': True,
            'message': f"Wygenerowano embeddings: {result['success']}/{result['total']}",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error generating embeddings: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/api/zopk/knowledge/search', methods=['POST'])
@login_required
def api_zopk_knowledge_search():
    """
    Semantic search in the ZOPK knowledge base.

    Request JSON:
    - query: str (required) - search query
    - limit: int (default 5) - max results (clamped to 1..20)

    Response JSON:
    - chunks: matching knowledge chunks with similarity scores
    - facts: relevant facts
    """
    from zopk_knowledge_service import search_knowledge, get_relevant_facts

    db = SessionLocal()
    try:
        # silent=True: tolerate a missing/non-JSON body instead of letting
        # Flask abort with 400/415 before our own validation runs
        data = request.get_json(silent=True) or {}
        # Strip so a whitespace-only query is rejected too (previously it
        # passed the truthiness check and triggered a wasted embedding call)
        query = (data.get('query') or '').strip()

        if not query:
            return jsonify({'success': False, 'error': 'Query wymagane'}), 400

        # Clamp to 1..20 results
        limit = max(1, min(data.get('limit', 5), 20))

        # Search chunks (semantic / embedding-based)
        chunks = search_knowledge(
            db,
            query=query,
            limit=limit,
            user_id=current_user.id
        )

        # Get relevant facts
        facts = get_relevant_facts(db, query=query, limit=limit)

        return jsonify({
            'success': True,
            'query': query,
            'chunks': chunks,
            'facts': facts
        })

    except Exception as e:
        logger.error(f"Error in knowledge search: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# KRS AUDIT (Krajowy Rejestr Sądowy)
|
||||
# ============================================================
|
||||
|
||||
13
database.py
13
database.py
@ -1902,6 +1902,19 @@ class ZOPKNews(Base):
|
||||
is_featured = Column(Boolean, default=False)
|
||||
views_count = Column(Integer, default=0)
|
||||
|
||||
# Full content (scraped from source URL) - for knowledge extraction
|
||||
full_content = Column(Text) # Full article text (without HTML, ads, navigation)
|
||||
content_scraped_at = Column(DateTime) # When content was scraped
|
||||
scrape_status = Column(String(20), default='pending', index=True) # pending, scraped, failed, skipped
|
||||
scrape_error = Column(Text) # Error message if scraping failed
|
||||
scrape_attempts = Column(Integer, default=0) # Number of scraping attempts
|
||||
content_word_count = Column(Integer) # Word count of scraped content
|
||||
content_language = Column(String(10), default='pl') # pl, en
|
||||
|
||||
# Knowledge extraction status
|
||||
knowledge_extracted = Column(Boolean, default=False, index=True) # True if chunks/facts/entities extracted
|
||||
knowledge_extracted_at = Column(DateTime) # When knowledge was extracted
|
||||
|
||||
created_at = Column(DateTime, default=datetime.now)
|
||||
updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
|
||||
|
||||
|
||||
58
database/migrations/015_zopk_full_content.sql
Normal file
58
database/migrations/015_zopk_full_content.sql
Normal file
@ -0,0 +1,58 @@
|
||||
-- Migration 015: Add full_content fields to zopk_news for knowledge base extraction
-- Date: 2026-01-16
-- Purpose: Store scraped article content for AI knowledge extraction
-- Idempotent: uses IF NOT EXISTS throughout, safe to re-run (PostgreSQL).

-- ============================================================
-- ADD NEW COLUMNS TO zopk_news
-- ============================================================

-- Full article content (scraped from source URL, HTML/ads/navigation stripped)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS full_content TEXT;

-- Content scraping metadata
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_scraped_at TIMESTAMP;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_status VARCHAR(20) DEFAULT 'pending';
-- Status values: pending, scraped, failed, skipped

-- Scraping error tracking (message of last failure + retry counter)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_error TEXT;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_attempts INTEGER DEFAULT 0;

-- Content metadata (extracted during scraping)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_word_count INTEGER;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_language VARCHAR(10) DEFAULT 'pl';

-- Knowledge extraction status (set once chunks/facts/entities were extracted)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS knowledge_extracted BOOLEAN DEFAULT FALSE;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS knowledge_extracted_at TIMESTAMP;

-- ============================================================
-- INDEXES FOR EFFICIENT QUERYING
-- ============================================================

-- Index for finding articles to scrape
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_status ON zopk_news(scrape_status);

-- Index for finding articles ready for knowledge extraction
CREATE INDEX IF NOT EXISTS idx_zopk_news_knowledge_extracted ON zopk_news(knowledge_extracted);

-- Composite index for the scraping pipeline
-- (filter by approval status, then scrape state, then extraction state)
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_pipeline
ON zopk_news(status, scrape_status, knowledge_extracted);

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON COLUMN zopk_news.full_content IS 'Full article text scraped from source URL (without HTML, ads, navigation)';
COMMENT ON COLUMN zopk_news.scrape_status IS 'pending=not scraped, scraped=success, failed=error, skipped=not scrapeable';
COMMENT ON COLUMN zopk_news.scrape_error IS 'Error message if scraping failed';
COMMENT ON COLUMN zopk_news.scrape_attempts IS 'Number of scraping attempts (for retry logic)';
COMMENT ON COLUMN zopk_news.content_word_count IS 'Word count of scraped content';
COMMENT ON COLUMN zopk_news.knowledge_extracted IS 'True if chunks/facts/entities extracted';

-- ============================================================
-- GRANT PERMISSIONS
-- ============================================================

GRANT ALL ON TABLE zopk_news TO nordabiz_app;
|
||||
@ -404,6 +404,126 @@ class GeminiService:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to log API cost: {e}")
|
||||
|
||||
def generate_embedding(
    self,
    text: str,
    task_type: str = 'retrieval_document',
    title: Optional[str] = None,
    user_id: Optional[int] = None,
    feature: str = 'embedding'
) -> Optional[List[float]]:
    """
    Generate an embedding vector for text using Google's text-embedding model.

    Args:
        text: Text to embed
        task_type: One of:
            - 'retrieval_document': For documents to be retrieved
            - 'retrieval_query': For search queries
            - 'semantic_similarity': For comparing texts
            - 'classification': For text classification
            - 'clustering': For text clustering
        title: Optional document title (improves quality; only honored for
            'retrieval_document' — the API rejects it for other task types)
        user_id: User ID for cost tracking
        feature: Feature name for cost tracking

    Returns:
        768-dimensional embedding vector or None on error.

    Cost: ~$0.00001 per 1K tokens (very cheap).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for embedding")
        return None

    start_time = time.time()

    try:
        # text-embedding-004 (768 dimensions) is Google's recommended model.
        # Fix: forward `title` only for the retrieval_document task type —
        # the embed_content API accepts title solely for that task type and
        # errors out when it is supplied with any other one.
        embed_kwargs = {
            'model': 'models/text-embedding-004',
            'content': text,
            'task_type': task_type,
        }
        if title is not None and task_type == 'retrieval_document':
            embed_kwargs['title'] = title

        result = genai.embed_content(**embed_kwargs)

        embedding = result.get('embedding')

        if not embedding:
            logger.error("No embedding returned from API")
            return None

        # Log cost (embedding API is very cheap)
        latency_ms = int((time.time() - start_time) * 1000)
        token_count = len(text) // 4  # rough heuristic: ~4 chars per token

        # Embedding pricing: ~$0.00001 per 1K tokens
        cost_usd = (token_count / 1000) * 0.00001

        logger.debug(
            f"Embedding generated: {len(embedding)} dims, "
            f"{token_count} tokens, {latency_ms}ms, ${cost_usd:.8f}"
        )

        # Persist usage to the DB only when cost attribution is possible
        if DB_AVAILABLE and user_id:
            try:
                db = SessionLocal()
                try:
                    usage_log = AIUsageLog(
                        request_type=feature,
                        model='text-embedding-004',
                        tokens_input=token_count,
                        tokens_output=0,
                        cost_cents=cost_usd * 100,
                        user_id=user_id,
                        prompt_length=len(text),
                        response_length=len(embedding) * 4,  # 4 bytes per float
                        response_time_ms=latency_ms,
                        success=True
                    )
                    db.add(usage_log)
                    db.commit()
                finally:
                    db.close()
            except Exception as e:
                # Cost logging is best-effort; never fail the embedding call
                logger.error(f"Failed to log embedding cost: {e}")

        return embedding

    except Exception as e:
        logger.error(f"Embedding generation error: {e}")
        return None
|
||||
|
||||
def generate_embeddings_batch(
    self,
    texts: List[str],
    task_type: str = 'retrieval_document',
    user_id: Optional[int] = None
) -> List[Optional[List[float]]]:
    """
    Generate embeddings for multiple texts, one API call per text.

    Args:
        texts: List of texts to embed
        task_type: Task type applied to every embedding
        user_id: User ID for cost tracking

    Returns:
        List of embedding vectors, positionally aligned with ``texts``
        (None for items that failed).
    """
    return [
        self.generate_embedding(
            text=item,
            task_type=task_type,
            user_id=user_id,
            feature='embedding_batch'
        )
        for item in texts
    ]
|
||||
|
||||
|
||||
# Global service instance (initialized in app.py)
|
||||
_gemini_service: Optional[GeminiService] = None
|
||||
|
||||
203
nordabiz_chat.py
203
nordabiz_chat.py
@ -18,12 +18,16 @@ Created: 2025-11-23
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
import google.generativeai as genai
|
||||
import gemini_service
|
||||
from search_service import search_companies
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from database import (
|
||||
SessionLocal,
|
||||
Company,
|
||||
@ -58,6 +62,13 @@ try:
|
||||
except ImportError:
|
||||
FEEDBACK_LEARNING_AVAILABLE = False
|
||||
|
||||
# Import ZOPK knowledge service for semantic search
|
||||
try:
|
||||
from zopk_knowledge_service import search_knowledge, get_relevant_facts
|
||||
ZOPK_KNOWLEDGE_AVAILABLE = True
|
||||
except ImportError:
|
||||
ZOPK_KNOWLEDGE_AVAILABLE = False
|
||||
|
||||
|
||||
class NordaBizChatEngine:
|
||||
"""
|
||||
@ -347,7 +358,7 @@ class NordaBizChatEngine:
|
||||
from datetime import timedelta
|
||||
news_cutoff = datetime.now() - timedelta(days=30)
|
||||
recent_news = db.query(ZOPKNews).filter(
|
||||
ZOPKNews.status == 'approved',
|
||||
ZOPKNews.status.in_(['approved', 'auto_approved']),
|
||||
ZOPKNews.published_at >= news_cutoff
|
||||
).order_by(ZOPKNews.published_at.desc()).limit(10).all()
|
||||
|
||||
@ -361,6 +372,12 @@ class NordaBizChatEngine:
|
||||
for news in recent_news
|
||||
]
|
||||
|
||||
# === ZOPK KNOWLEDGE BASE (semantic search) ===
|
||||
# Detect if question is about ZOPK topics
|
||||
if self._is_zopk_query(current_message):
|
||||
zopk_knowledge = self._get_zopk_knowledge_context(db, current_message)
|
||||
context['zopk_knowledge'] = zopk_knowledge
|
||||
|
||||
# === ETAP 2: Tablica B2B, Kalendarz, Forum ===
|
||||
|
||||
# Add upcoming events (next 60 days)
|
||||
@ -600,6 +617,135 @@ class NordaBizChatEngine:
|
||||
# Extract Company objects from SearchResult
|
||||
return [result.company for result in results]
|
||||
|
||||
def _is_zopk_query(self, message: str) -> bool:
|
||||
"""
|
||||
Check if the message is related to ZOPK (Zielony Okręg Przemysłowy Kaszubia).
|
||||
|
||||
ZOPK topics include:
|
||||
- Offshore wind energy (Baltic Power, Baltica)
|
||||
- Nuclear power plant (Lubiatowo-Kopalino)
|
||||
- Kongsberg investment in Rumia
|
||||
- Infrastructure (Via Pomerania, S6, Droga Czerwona)
|
||||
- Hydrogen, data centers
|
||||
"""
|
||||
zopk_keywords = [
|
||||
# Main project
|
||||
'zopk', 'zielony okręg', 'okręg przemysłowy',
|
||||
# Offshore
|
||||
'offshore', 'farmy wiatrowe', 'energetyka wiatrowa', 'bałtyk', 'baltic power',
|
||||
'baltica', 'orsted', 'morska energia',
|
||||
# Nuclear
|
||||
'elektrownia jądrowa', 'atomowa', 'lubiatowo', 'kopalino', 'pej',
|
||||
# Kongsberg
|
||||
'kongsberg', 'inwestycje norweskie', 'przemysł obronny',
|
||||
# Infrastructure
|
||||
'via pomerania', 'droga czerwona', 's6', 'port gdynia',
|
||||
# Other
|
||||
'wodór', 'centra danych', 'samsonowicz', 'transformacja energetyczna',
|
||||
# Organizations
|
||||
'norda biznes', 'izba przedsiębiorców', 'rumia invest'
|
||||
]
|
||||
|
||||
message_lower = message.lower()
|
||||
return any(kw in message_lower for kw in zopk_keywords)
|
||||
|
||||
def _get_zopk_knowledge_context(self, db, message: str) -> Dict[str, Any]:
    """
    Get ZOPK knowledge base context for the current message.

    Uses semantic search to find relevant:
    - Knowledge chunks (text fragments with embeddings)
    - Facts (structured information)
    - Entities (companies, people, projects)

    Args:
        db: Database session
        message: User's question

    Returns:
        Dict with 'chunks', 'facts', 'entities' lists. The lists stay empty
        when the knowledge service is unavailable or a lookup fails — the
        chat must never break because context building failed.
    """
    # NOTE(review): ZOPKKnowledgeChunk appears unused in this method — confirm
    from database import ZOPKKnowledgeEntity, ZOPKKnowledgeChunk, ZOPKNews

    context = {
        'chunks': [],
        'facts': [],
        'entities': []
    }

    # Check if knowledge service is available (module import may have failed)
    if not ZOPK_KNOWLEDGE_AVAILABLE:
        logger.warning("ZOPK knowledge service not available")
        return context

    try:
        # Semantic search in knowledge chunks
        chunks = search_knowledge(
            db,
            query=message,
            limit=5,
            min_similarity=0.3,  # drop weakly related chunks
            user_id=None  # Don't track cost for context building
        )

        # Enrich chunks with source information (portal name + publish date)
        for c in chunks:
            chunk_data = {
                'content': c['content'][:400],  # Limit length for the prompt
                'summary': c.get('summary', ''),
                'similarity': c.get('similarity', 0),
                'source': 'nieznane',
                'date': ''
            }

            # Get source news info if available
            if c.get('source_news_id'):
                news = db.query(ZOPKNews).filter(
                    ZOPKNews.id == c['source_news_id']
                ).first()
                if news:
                    chunk_data['source'] = news.source_name or news.source_domain or 'nieznane'
                    if news.published_at:
                        chunk_data['date'] = news.published_at.strftime('%Y-%m-%d')

            context['chunks'].append(chunk_data)

        # Get relevant facts (structured statements with confidence scores)
        facts = get_relevant_facts(db, query=message, limit=5)
        context['facts'] = [
            {
                'fact': f['full_text'],
                'type': f['fact_type'],
                'confidence': f.get('confidence_score', 0),
                'value': f.get('numeric_value'),
                'unit': f.get('numeric_unit')
            }
            for f in facts
        ]

        # Get top mentioned entities (always included for context, not
        # filtered by the query — mentions_count > 1 skips one-off mentions)
        top_entities = db.query(ZOPKKnowledgeEntity).filter(
            ZOPKKnowledgeEntity.mentions_count > 1
        ).order_by(
            ZOPKKnowledgeEntity.mentions_count.desc()
        ).limit(10).all()

        context['entities'] = [
            {
                'name': e.name,
                'type': e.entity_type,
                'description': e.short_description or '',
                'mentions': e.mentions_count
            }
            for e in top_entities
        ]

    except Exception as e:
        logger.error(f"Error getting ZOPK knowledge context: {e}")
        # Return empty context on error, don't break chat

    return context
|
||||
|
||||
def _query_ai(
|
||||
self,
|
||||
context: Dict[str, Any],
|
||||
@ -799,6 +945,61 @@ BŁĘDNIE (NIE RÓB - resetuje numerację):
|
||||
system_prompt += json.dumps(context['recent_news'], ensure_ascii=False, indent=None)
|
||||
system_prompt += "\n"
|
||||
|
||||
# Add ZOPK Knowledge Base context (semantic search results)
|
||||
if context.get('zopk_knowledge'):
|
||||
zopk = context['zopk_knowledge']
|
||||
system_prompt += "\n\n🌍 BAZA WIEDZY ZOPK (Zielony Okręg Przemysłowy Kaszubia):\n"
|
||||
system_prompt += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
||||
|
||||
# Add knowledge chunks (most relevant excerpts)
|
||||
if zopk.get('chunks'):
|
||||
system_prompt += "\n📄 FRAGMENTY WIEDZY (semantycznie dopasowane):\n"
|
||||
for i, chunk in enumerate(zopk['chunks'][:5], 1):
|
||||
system_prompt += f"\n[{i}] {chunk.get('summary', '')}\n"
|
||||
system_prompt += f" Źródło: {chunk.get('source', 'nieznane')} ({chunk.get('date', '')})\n"
|
||||
if chunk.get('content'):
|
||||
# Skrócona treść (max 300 znaków)
|
||||
content_preview = chunk['content'][:300]
|
||||
if len(chunk['content']) > 300:
|
||||
content_preview += "..."
|
||||
system_prompt += f" Treść: {content_preview}\n"
|
||||
|
||||
# Add verified facts
|
||||
if zopk.get('facts'):
|
||||
system_prompt += "\n📌 ZWERYFIKOWANE FAKTY:\n"
|
||||
for fact in zopk['facts'][:10]:
|
||||
confidence_stars = "★" * int(fact.get('confidence', 0) * 5)
|
||||
system_prompt += f"• {fact.get('fact', '')} [{confidence_stars}]\n"
|
||||
if fact.get('value') and fact.get('unit'):
|
||||
system_prompt += f" Wartość: {fact['value']} {fact['unit']}\n"
|
||||
|
||||
# Add key entities
|
||||
if zopk.get('entities'):
|
||||
system_prompt += "\n🏢 KLUCZOWE PODMIOTY ZOPK:\n"
|
||||
for entity in zopk['entities'][:8]:
|
||||
entity_icon = {
|
||||
'organization': '🏛️',
|
||||
'company': '🏢',
|
||||
'person': '👤',
|
||||
'location': '📍',
|
||||
'project': '🎯',
|
||||
'technology': '⚡'
|
||||
}.get(entity.get('type', ''), '•')
|
||||
system_prompt += f"{entity_icon} {entity.get('name', '')} ({entity.get('type', '')})"
|
||||
if entity.get('description'):
|
||||
system_prompt += f" - {entity['description']}"
|
||||
if entity.get('mentions'):
|
||||
system_prompt += f" [{entity['mentions']} wzmianek]"
|
||||
system_prompt += "\n"
|
||||
|
||||
system_prompt += "\n🎯 ZASADY ODPOWIEDZI O ZOPK:\n"
|
||||
system_prompt += "1. Odpowiadaj na podstawie bazy wiedzy (nie wymyślaj faktów)\n"
|
||||
system_prompt += "2. Cytuj źródła: \"Według [portal] z [data]...\"\n"
|
||||
system_prompt += "3. Podawaj konkretne daty i liczby gdy dostępne\n"
|
||||
system_prompt += "4. Wymieniaj organizacje i osoby zaangażowane\n"
|
||||
system_prompt += "5. Jeśli brak informacji w bazie - powiedz wprost\n"
|
||||
system_prompt += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
||||
|
||||
# Add upcoming events (Etap 2)
|
||||
if context.get('upcoming_events'):
|
||||
system_prompt += "\n\n📅 KALENDARZ WYDARZEŃ:\n"
|
||||
|
||||
670
zopk_content_scraper.py
Normal file
670
zopk_content_scraper.py
Normal file
@ -0,0 +1,670 @@
|
||||
"""
|
||||
ZOPK Content Scraper - Pobieranie pełnej treści artykułów dla bazy wiedzy.
|
||||
|
||||
Scraper respektuje robots.txt i stosuje rate limiting.
|
||||
Obsługuje główne polskie portale newsowe.
|
||||
|
||||
Usage:
|
||||
from zopk_content_scraper import ZOPKContentScraper
|
||||
|
||||
scraper = ZOPKContentScraper(db_session)
|
||||
result = scraper.scrape_article(news_id=123)
|
||||
# lub batch:
|
||||
result = scraper.batch_scrape(limit=50)
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Comment, NavigableString
|
||||
|
||||
from database import ZOPKNews
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ============================================================
# CONFIGURATION
# ============================================================

# User-Agent identifying the bot. Includes a bot-info URL and a contact
# e-mail so site operators can identify and reach us (polite-crawling
# convention; also lets sites target us in robots.txt).
USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'

# Request timeout in seconds (applies to each HTTP fetch).
REQUEST_TIMEOUT = 15

# Maximum content length (chars) to avoid memory issues when a page is huge.
MAX_CONTENT_LENGTH = 100000  # ~100KB of text

# Rate limiting: minimum seconds between consecutive requests per domain.
# 'default' applies to any domain not listed explicitly.
RATE_LIMITS = {
    'trojmiasto.pl': 2.0,
    'dziennikbaltycki.pl': 2.0,
    'nordafm.pl': 1.5,
    'ttm24.pl': 1.5,
    'radiogdansk.pl': 1.5,
    'portalmorski.pl': 1.5,
    'biznes.pap.pl': 2.0,
    'default': 3.0
}

# Maximum retry attempts; articles whose scrape_attempts reaches this count
# are no longer retried by batch_scrape() unless force=True.
MAX_RETRY_ATTEMPTS = 3

# ============================================================
# CONTENT SELECTORS PER DOMAIN
# ============================================================

# CSS selectors for article content extraction.
# Order matters - first match wins; 'default' is the fallback list used
# for domains without a dedicated entry.
CONTENT_SELECTORS = {
    'trojmiasto.pl': [
        'article.article-content',
        'div.article-body',
        'div.article__content',
        'div[itemprop="articleBody"]',
    ],
    'dziennikbaltycki.pl': [
        'div.article-body',
        'article.article-main',
        'div[itemprop="articleBody"]',
        'div.art-content',
    ],
    'nordafm.pl': [
        'div.entry-content',
        'article.post-content',
        'div.post-body',
    ],
    'ttm24.pl': [
        'div.post-content',
        'article.entry-content',
        'div.article-content',
    ],
    'radiogdansk.pl': [
        'div.article-content',
        'div.entry-content',
        'article.post',
    ],
    'portalmorski.pl': [
        'div.article-content',
        'div.entry-content',
        'article.post-content',
    ],
    'biznes.pap.pl': [
        'div.article-content',
        'div.news-content',
        'article.content',
    ],
    'gov.pl': [
        'div.article-content',
        'main.main-content',
        'div.content',
    ],
    'default': [
        'article',
        'div[itemprop="articleBody"]',
        'div.article-content',
        'div.article-body',
        'div.entry-content',
        'div.post-content',
        'main.content',
        'main',
    ]
}

# Elements (boilerplate/ads/navigation) stripped from pages before text
# extraction. Entries starting with '.', '#' or '[' are treated as CSS
# selectors; plain names are treated as tag names (see _clean_html).
ELEMENTS_TO_REMOVE = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
    'form', 'iframe', 'noscript', 'svg', 'canvas',
    '.advertisement', '.ad', '.ads', '.advert', '.banner',
    '.social-share', '.share-buttons', '.sharing',
    '.related-articles', '.related-posts', '.recommendations',
    '.comments', '.comment-section', '#comments',
    '.newsletter', '.subscription', '.subscribe',
    '.cookie-notice', '.cookie-banner', '.gdpr',
    '.popup', '.modal', '.overlay',
    '.sidebar', '.widget', '.navigation',
    '.breadcrumb', '.breadcrumbs',
    '.author-bio', '.author-box',
    '.tags', '.tag-list', '.categories',
    '.pagination', '.pager',
    '[data-ad]', '[data-advertisement]',
]

# Domains that are not scrapeable (social media requiring login, dynamic
# content, paywalls, etc.); articles from these are marked 'skipped'.
SKIP_DOMAINS = [
    'facebook.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'youtube.com',
    'instagram.com',
]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# DATA CLASSES
|
||||
# ============================================================
|
||||
|
||||
@dataclass
class ScrapeResult:
    """Result of scraping an article.

    Returned by ZOPKContentScraper.scrape_article(). Callers should branch
    on `status` (it distinguishes 'skipped' domains from real failures),
    not on `success` alone.
    """
    success: bool
    content: Optional[str] = None  # extracted plain text; None when not scraped
    word_count: int = 0  # word count of `content` (0 when not scraped)
    error: Optional[str] = None  # human-readable error or skip reason
    status: str = 'pending'  # scraped, failed, skipped
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SCRAPER CLASS
|
||||
# ============================================================
|
||||
|
||||
class ZOPKContentScraper:
    """
    Scraper for ZOPK news article content.

    Features:
    - Domain-specific content selectors (CONTENT_SELECTORS)
    - Rate limiting per domain (RATE_LIMITS)
    - HTML cleaning (removes ads, navigation, comments, etc.)
    - Bounded retries for failed articles (MAX_RETRY_ATTEMPTS)
    - Identifies itself via USER_AGENT so site operators can apply policies
    """

    def __init__(self, db_session, user_id: Optional[int] = None):
        """
        Initialize scraper.

        Args:
            db_session: SQLAlchemy database session
            user_id: Optional user ID for audit logging
        """
        self.db = db_session
        self.user_id = user_id
        # Wall-clock timestamp of the last request per domain; consulted by
        # _wait_for_rate_limit() to throttle consecutive fetches.
        self._last_request_time: Dict[str, float] = {}
        self._session = self._create_session()

    def _create_session(self) -> requests.Session:
        """Create requests session with proper headers."""
        session = requests.Session()
        session.headers.update({
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'pl-PL,pl;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        return session

    @staticmethod
    def _domain_matches(domain: str, known: str) -> bool:
        """
        Return True if *domain* equals *known* or is a subdomain of it.

        Substring/suffix checks are unsafe here: 'x.com' is a substring of
        'box.com' and 'evilx.com'.endswith('x.com') is True. Requiring an
        exact match or a '.'-delimited boundary avoids such false positives.
        """
        return domain == known or domain.endswith('.' + known)

    def _get_domain(self, url: str) -> str:
        """Extract normalized domain (lowercase, no 'www.' prefix) from URL."""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            # Remove www. prefix
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        except Exception:
            return 'unknown'

    def _get_rate_limit(self, domain: str) -> float:
        """Get rate limit (seconds between requests) for domain."""
        # Check exact domain first
        if domain in RATE_LIMITS:
            return RATE_LIMITS[domain]
        # Subdomain match; skip the 'default' sentinel key so it never
        # matches a real hostname (consistent with _get_content_selectors).
        for known_domain, limit in RATE_LIMITS.items():
            if known_domain != 'default' and self._domain_matches(domain, known_domain):
                return limit
        return RATE_LIMITS['default']

    def _wait_for_rate_limit(self, domain: str) -> None:
        """Sleep if needed so requests to *domain* respect its rate limit."""
        limit = self._get_rate_limit(domain)
        last_time = self._last_request_time.get(domain, 0)
        elapsed = time.time() - last_time
        if elapsed < limit:
            wait_time = limit - elapsed
            logger.debug(f"Rate limiting: waiting {wait_time:.2f}s for {domain}")
            time.sleep(wait_time)
        self._last_request_time[domain] = time.time()

    def _should_skip_domain(self, domain: str) -> bool:
        """Check if domain is on the skip list (social media/paywalls)."""
        # Boundary-aware match: a bare substring test would wrongly skip
        # e.g. 'box.com' because it contains 'x.com'.
        return any(self._domain_matches(domain, skip) for skip in SKIP_DOMAINS)

    def _get_content_selectors(self, domain: str) -> List[str]:
        """Get ordered CSS selectors to try for *domain* (first match wins)."""
        # Check exact domain
        if domain in CONTENT_SELECTORS:
            return CONTENT_SELECTORS[domain]
        # Subdomain match against known domains
        for known_domain, selectors in CONTENT_SELECTORS.items():
            if known_domain != 'default' and self._domain_matches(domain, known_domain):
                return selectors
        return CONTENT_SELECTORS['default']

    def _fetch_html(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fetch HTML content from URL.

        Applies the domain skip list and per-domain rate limiting before
        issuing the request.

        Returns:
            Tuple of (html_content, error_message) - exactly one is None.
        """
        domain = self._get_domain(url)

        # Check if domain should be skipped
        if self._should_skip_domain(domain):
            return None, f"Domain {domain} is not scrapeable (social media/paywall)"

        # Apply rate limiting
        self._wait_for_rate_limit(domain)

        try:
            response = self._session.get(
                url,
                timeout=REQUEST_TIMEOUT,
                allow_redirects=True
            )
            response.raise_for_status()

            # Only HTML/XHTML responses can be parsed as articles.
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type and 'application/xhtml' not in content_type:
                return None, f"Not HTML content: {content_type}"

            # Some portals mis-declare the charset in headers; prefer the
            # content-sniffed encoding, falling back to UTF-8.
            response.encoding = response.apparent_encoding or 'utf-8'

            return response.text, None

        except requests.exceptions.Timeout:
            return None, "Request timeout"
        except requests.exceptions.TooManyRedirects:
            return None, "Too many redirects"
        except requests.exceptions.HTTPError as e:
            # e.response can be None in rare cases; don't crash while reporting.
            status = e.response.status_code if e.response is not None else 'unknown'
            return None, f"HTTP error: {status}"
        except requests.exceptions.ConnectionError:
            return None, "Connection error"
        except requests.exceptions.RequestException as e:
            return None, f"Request error: {str(e)}"

    def _clean_html(self, soup: BeautifulSoup) -> BeautifulSoup:
        """Remove unwanted elements (ads, nav, comments, ...) in place."""
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove unwanted elements listed in ELEMENTS_TO_REMOVE
        for selector in ELEMENTS_TO_REMOVE:
            if selector.startswith('.') or selector.startswith('#') or selector.startswith('['):
                # CSS selector
                for element in soup.select(selector):
                    element.decompose()
            else:
                # Tag name
                for element in soup.find_all(selector):
                    element.decompose()

        return soup

    def _extract_content(self, html: str, domain: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract article content from HTML.

        Tries domain-specific selectors first, then falls back to the
        largest text block heuristic.

        Returns:
            Tuple of (content_text, error_message) - exactly one is None.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Clean HTML first
            soup = self._clean_html(soup)

            # Try domain-specific selectors
            selectors = self._get_content_selectors(domain)
            content_element = None

            for selector in selectors:
                content_element = soup.select_one(selector)
                if content_element:
                    logger.debug(f"Found content with selector: {selector}")
                    break

            if not content_element:
                # Fallback: try to find largest text block
                content_element = self._find_largest_text_block(soup)

            if not content_element:
                return None, "Could not find article content"

            # Extract text
            text = self._extract_text(content_element)

            # Very short extractions are almost always navigation/teaser text.
            if not text or len(text) < 100:
                return None, "Extracted content too short"

            # Truncate if too long
            if len(text) > MAX_CONTENT_LENGTH:
                text = text[:MAX_CONTENT_LENGTH] + "..."
                logger.warning(f"Content truncated to {MAX_CONTENT_LENGTH} chars")

            return text, None

        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            return None, f"Extraction error: {str(e)}"

    def _find_largest_text_block(self, soup: BeautifulSoup) -> Optional[BeautifulSoup]:
        """Find the largest text block in the page (fallback method)."""
        candidates = soup.find_all(['article', 'main', 'div', 'section'])

        best_element = None
        best_score = 0

        for element in candidates:
            # Skip small elements
            text = element.get_text(strip=True)
            if len(text) < 200:
                continue

            # Score = text length plus a bonus per paragraph, so article-like
            # containers beat link farms of similar length.
            paragraphs = len(element.find_all('p'))
            text_length = len(text)
            score = text_length + (paragraphs * 100)

            if score > best_score:
                best_score = score
                best_element = element

        return best_element

    def _extract_text(self, element: BeautifulSoup) -> str:
        """Extract clean text from element, preserving paragraph breaks."""
        # Collect text fragments; block-level tags contribute '\n' markers
        # so paragraphs stay separated in the output.
        lines = []

        for child in element.descendants:
            if isinstance(child, NavigableString):
                text = str(child).strip()
                if text:
                    lines.append(text)
            elif child.name in ['br', 'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                lines.append('\n')

        # Join and clean
        text = ' '.join(lines)

        # Collapse horizontal whitespace only - a blanket r'\s+' -> ' ' would
        # also destroy the '\n' paragraph markers inserted above, flattening
        # the whole article onto a single line.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Trim spaces around newlines, then cap runs of blank lines at one.
        text = re.sub(r' ?\n ?', '\n', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = text.strip()

        return text

    def _count_words(self, text: str) -> int:
        """Count words in text."""
        if not text:
            return 0
        words = re.findall(r'\b\w+\b', text)
        return len(words)

    def scrape_article(self, news_id: int) -> ScrapeResult:
        """
        Scrape content for a single article.

        Updates the ZOPKNews row (full_content, scrape_status, scrape_error,
        scrape_attempts, content_word_count, content_scraped_at) and commits.

        Args:
            news_id: ID of ZOPKNews record

        Returns:
            ScrapeResult with content or error
        """
        # Get news record
        news = self.db.query(ZOPKNews).filter(ZOPKNews.id == news_id).first()

        if not news:
            return ScrapeResult(
                success=False,
                error=f"News record {news_id} not found",
                status='failed'
            )

        # Already scraped: return cached content, no network round-trip.
        if news.scrape_status == 'scraped' and news.full_content:
            return ScrapeResult(
                success=True,
                content=news.full_content,
                word_count=news.content_word_count or 0,
                status='scraped'
            )

        url = news.url
        domain = self._get_domain(url)

        logger.info(f"Scraping article {news_id}: {url}")

        # Skipped domains are terminal - no attempt counter, never retried.
        if self._should_skip_domain(domain):
            news.scrape_status = 'skipped'
            news.scrape_error = f"Domain {domain} not scrapeable"
            self.db.commit()
            return ScrapeResult(
                success=False,
                error=f"Domain {domain} not scrapeable",
                status='skipped'
            )

        # Fetch HTML
        html, fetch_error = self._fetch_html(url)

        if fetch_error:
            self._record_failure(news, fetch_error)
            return ScrapeResult(
                success=False,
                error=fetch_error,
                status='failed'
            )

        # Extract content
        content, extract_error = self._extract_content(html, domain)

        if extract_error:
            self._record_failure(news, extract_error)
            return ScrapeResult(
                success=False,
                error=extract_error,
                status='failed'
            )

        # Success - update database
        word_count = self._count_words(content)

        news.full_content = content
        news.content_word_count = word_count
        news.content_scraped_at = datetime.now()
        news.scrape_status = 'scraped'
        news.scrape_error = None
        news.scrape_attempts = (news.scrape_attempts or 0) + 1

        self.db.commit()

        logger.info(f"Successfully scraped article {news_id}: {word_count} words")

        return ScrapeResult(
            success=True,
            content=content,
            word_count=word_count,
            status='scraped'
        )

    def _record_failure(self, news, error: str) -> None:
        """Persist a failed scrape attempt on the news row and commit."""
        news.scrape_status = 'failed'
        news.scrape_error = error
        news.scrape_attempts = (news.scrape_attempts or 0) + 1
        self.db.commit()

    def batch_scrape(
        self,
        limit: int = 50,
        status_filter: Optional[str] = None,
        force: bool = False
    ) -> Dict:
        """
        Batch scrape articles.

        Args:
            limit: Maximum number of articles to scrape
            status_filter: Filter by approval status (approved, auto_approved)
            force: If True, re-scrape even already scraped articles

        Returns:
            Dict with statistics: total, scraped, failed, skipped, errors,
            scraped_articles, processing_time (seconds).
        """
        logger.info(f"Starting batch scrape: limit={limit}, force={force}")

        # Build query
        query = self.db.query(ZOPKNews)

        # Filter by approval status
        if status_filter:
            query = query.filter(ZOPKNews.status == status_filter)
        else:
            # Default: only approved/auto_approved articles
            query = query.filter(ZOPKNews.status.in_(['approved', 'auto_approved']))

        # Filter by scrape status
        if not force:
            # Candidates: never-scraped articles - including rows predating
            # the scrape_status column, where it is NULL (the stats endpoint
            # counts NULL as pending, so treat it the same here) - plus
            # failed ones that still have retries left.
            query = query.filter(
                (ZOPKNews.scrape_status == 'pending') |
                (ZOPKNews.scrape_status.is_(None)) |
                ((ZOPKNews.scrape_status == 'failed') & (ZOPKNews.scrape_attempts < MAX_RETRY_ATTEMPTS))
            )

        # Order by creation date (newest first)
        query = query.order_by(ZOPKNews.created_at.desc())

        # Limit
        articles = query.limit(limit).all()

        # Statistics
        stats = {
            'total': len(articles),
            'scraped': 0,
            'failed': 0,
            'skipped': 0,
            'errors': [],
            'scraped_articles': [],
            'processing_time': 0
        }

        start_time = time.time()

        for article in articles:
            result = self.scrape_article(article.id)

            if result.status == 'scraped':
                stats['scraped'] += 1
                stats['scraped_articles'].append({
                    'id': article.id,
                    'title': article.title[:100],
                    'word_count': result.word_count,
                    'source': article.source_name
                })
            elif result.status == 'skipped':
                stats['skipped'] += 1
            else:
                stats['failed'] += 1
                stats['errors'].append({
                    'id': article.id,
                    'url': article.url,
                    'error': result.error
                })

        stats['processing_time'] = round(time.time() - start_time, 2)

        logger.info(
            f"Batch scrape complete: {stats['scraped']} scraped, "
            f"{stats['failed']} failed, {stats['skipped']} skipped "
            f"in {stats['processing_time']}s"
        )

        return stats

    def get_scrape_statistics(self) -> Dict:
        """Get scraping statistics for approved/auto_approved articles."""
        from sqlalchemy import func

        # Count by scrape_status
        status_counts = self.db.query(
            ZOPKNews.scrape_status,
            func.count(ZOPKNews.id)
        ).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).group_by(ZOPKNews.scrape_status).all()

        status_dict = {status: count for status, count in status_counts}

        # Total approved articles
        total_approved = self.db.query(func.count(ZOPKNews.id)).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).scalar()

        # Articles ready for knowledge extraction
        ready_for_extraction = self.db.query(func.count(ZOPKNews.id)).filter(
            ZOPKNews.scrape_status == 'scraped',
            ZOPKNews.knowledge_extracted == False
        ).scalar()

        # Average word count
        avg_word_count = self.db.query(func.avg(ZOPKNews.content_word_count)).filter(
            ZOPKNews.scrape_status == 'scraped'
        ).scalar()

        return {
            'total_approved': total_approved or 0,
            'scraped': status_dict.get('scraped', 0),
            # Rows with NULL scrape_status (pre-migration) count as pending.
            'pending': status_dict.get('pending', 0) + status_dict.get(None, 0),
            'failed': status_dict.get('failed', 0),
            'skipped': status_dict.get('skipped', 0),
            'ready_for_extraction': ready_for_extraction or 0,
            # int, not round(x, 0): avoids a float like 532.0 in the JSON.
            'avg_word_count': int(round(avg_word_count or 0))
        }
|
||||
|
||||
|
||||
# ============================================================
|
||||
# STANDALONE FUNCTIONS FOR CRON/CLI
|
||||
# ============================================================
|
||||
|
||||
def scrape_pending_articles(db_session, limit: int = 50) -> Dict:
    """
    Convenience function for cron jobs.

    Usage:
        from zopk_content_scraper import scrape_pending_articles
        result = scrape_pending_articles(db_session, limit=50)
    """
    # Delegate straight to a throwaway scraper instance.
    return ZOPKContentScraper(db_session).batch_scrape(limit=limit)
|
||||
|
||||
|
||||
def get_scrape_stats(db_session) -> Dict:
    """
    Get scraping statistics for monitoring.
    """
    # One-shot scraper instance just for the stats query.
    return ZOPKContentScraper(db_session).get_scrape_statistics()
|
||||
1039
zopk_knowledge_service.py
Normal file
1039
zopk_knowledge_service.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user