feat(zopk): Knowledge Base + NordaGPT integration (FAZY 0-3)
FAZA 0 - Web Scraping: - Migracja 015: pola full_content, scrape_status w zopk_news - zopk_content_scraper.py: scraper z rate limiting i selektorami FAZA 1 - Knowledge Extraction: - zopk_knowledge_service.py: chunking, facts, entities extraction - Endpointy /admin/zopk/knowledge/extract FAZA 2 - Embeddings: - gemini_service.py: generate_embedding(), generate_embeddings_batch() - Model text-embedding-004 (768 dimensions) FAZA 3 - NordaGPT Integration: - nordabiz_chat.py: _is_zopk_query(), _get_zopk_knowledge_context() - System prompt z bazą wiedzy ZOPK - Semantic search w kontekście chatu Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
0f1cf6176a
commit
1b4cd31c41
325
app.py
325
app.py
@ -10849,6 +10849,331 @@ def api_zopk_search_news():
|
||||
db.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ZOPK CONTENT SCRAPING (Knowledge Base Pipeline)
|
||||
# ============================================================
|
||||
|
||||
@app.route('/admin/zopk/news/scrape-stats')
@login_required
def admin_zopk_scrape_stats():
    """
    Return content-scraping statistics as JSON (admin only).

    The response merges the dict produced by ``get_scrape_stats`` with a
    ``success`` flag. Fields include: total_approved, scraped, pending,
    failed, skipped, and ready_for_extraction (scraped but not yet
    processed for knowledge).

    Returns 403 for non-admins and 500 with the error message on failure.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import get_scrape_stats

    session = SessionLocal()
    try:
        payload = {'success': True}
        payload.update(get_scrape_stats(session))
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting scrape stats: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/news/scrape-content', methods=['POST'])
@login_required
def admin_zopk_scrape_content():
    """
    Batch scrape article content from source URLs (admin only).

    Request JSON:
    - limit: int (default 50) - max articles to scrape (clamped to 1..100)
    - force: bool (default false) - re-scrape already scraped articles

    Response JSON:
    - scraped: number of successfully scraped articles
    - failed: number of failures
    - skipped: number of skipped (social media, paywalls, etc.)
    - errors: list of error details
    - scraped_articles: list of scraped article info
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import ZOPKContentScraper

    db = SessionLocal()
    try:
        # silent=True: a missing or malformed JSON body becomes {} instead of
        # Flask raising a 400/415 outside this handler's error handling
        data = request.get_json(silent=True) or {}
        # Clamp to a sane range: at most 100 per request, never less than 1
        limit = max(1, min(data.get('limit', 50), 100))
        force = data.get('force', False)

        scraper = ZOPKContentScraper(db, user_id=current_user.id)
        result = scraper.batch_scrape(limit=limit, force=force)

        return jsonify({
            'success': True,
            'message': f"Scraping zakończony: {result['scraped']} pobrano, "
                       f"{result['failed']} błędów, {result['skipped']} pominięto",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error in batch scrape: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/news/<int:news_id>/scrape', methods=['POST'])
@login_required
def admin_zopk_scrape_single(news_id):
    """
    Scrape content for a single article (admin only).

    Returns the scraped word count and status on success, or a 400 with
    the scraper's error message and status when the scrape fails.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import ZOPKContentScraper

    session = SessionLocal()
    try:
        outcome = ZOPKContentScraper(session, user_id=current_user.id).scrape_article(news_id)

        # Guard clause: report the failure first, success is the fall-through
        if not outcome.success:
            return jsonify({
                'success': False,
                'error': outcome.error,
                'status': outcome.status
            }), 400

        return jsonify({
            'success': True,
            'message': f"Pobrano treść: {outcome.word_count} słów",
            'word_count': outcome.word_count,
            'status': outcome.status
        })

    except Exception as e:
        session.rollback()
        logger.error(f"Error scraping article {news_id}: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# ZOPK KNOWLEDGE EXTRACTION (AI-powered)
|
||||
# ============================================================
|
||||
|
||||
@app.route('/admin/zopk/knowledge/stats')
@login_required
def admin_zopk_knowledge_stats():
    """
    Return knowledge-extraction statistics as JSON (admin only).

    The payload merges the dict from ``get_knowledge_stats`` with a
    ``success`` flag. It contains:
    - articles: stats about articles (approved, scraped, extracted)
    - knowledge_base: stats about chunks, facts, entities, relations
    - top_entities: most mentioned entities
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import get_knowledge_stats

    session = SessionLocal()
    try:
        payload = {'success': True}
        payload.update(get_knowledge_stats(session))
        return jsonify(payload)
    except Exception as e:
        logger.error(f"Error getting knowledge stats: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/extract', methods=['POST'])
@login_required
def admin_zopk_knowledge_extract():
    """
    Batch extract knowledge from scraped articles (admin only).

    Request JSON:
    - limit: int (default 50) - max articles to process (clamped to 1..100)

    Response JSON:
    - success/failed counts
    - chunks/facts/entities/relations created
    - errors list
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import ZOPKKnowledgeService

    db = SessionLocal()
    try:
        # silent=True: tolerate a missing/non-JSON body instead of letting
        # Flask abort with 400/415 before our error handling runs
        data = request.get_json(silent=True) or {}
        # Clamp to 1..100 so a negative or zero limit cannot reach the service
        limit = max(1, min(data.get('limit', 50), 100))

        service = ZOPKKnowledgeService(db, user_id=current_user.id)
        result = service.batch_extract(limit=limit)

        return jsonify({
            'success': True,
            'message': f"Ekstrakcja zakończona: {result['success']}/{result['total']} artykułów. "
                       f"Utworzono: {result['chunks_created']} chunks, {result['facts_created']} faktów, "
                       f"{result['entities_created']} encji, {result['relations_created']} relacji.",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error in knowledge extraction: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/extract/<int:news_id>', methods=['POST'])
@login_required
def admin_zopk_knowledge_extract_single(news_id):
    """
    Extract knowledge (chunks/facts/entities/relations) from one article.

    Admin only. Returns per-category creation counts and the processing
    time on success, or a 400 with the extraction error when it fails.
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import ZOPKKnowledgeService

    session = SessionLocal()
    try:
        extraction = ZOPKKnowledgeService(session, user_id=current_user.id).extract_from_news(news_id)

        # Guard clause: surface the extraction failure first
        if not extraction.success:
            return jsonify({
                'success': False,
                'error': extraction.error
            }), 400

        return jsonify({
            'success': True,
            'message': f"Wyekstrahowano: {extraction.chunks_created} chunks, "
                       f"{extraction.facts_created} faktów, {extraction.entities_created} encji",
            'chunks_created': extraction.chunks_created,
            'facts_created': extraction.facts_created,
            'entities_created': extraction.entities_created,
            'relations_created': extraction.relations_created,
            'processing_time': extraction.processing_time
        })

    except Exception as e:
        session.rollback()
        logger.error(f"Error extracting from news {news_id}: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.route('/admin/zopk/knowledge/embeddings', methods=['POST'])
@login_required
def admin_zopk_generate_embeddings():
    """
    Generate embeddings for knowledge chunks that don't have them yet.

    Admin only.

    Request JSON:
    - limit: int (default 100) - max chunks to process (clamped to 1..500)
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_knowledge_service import generate_chunk_embeddings

    db = SessionLocal()
    try:
        # silent=True: a missing/invalid JSON body becomes {} instead of
        # Flask aborting with 400/415 outside this handler's error path
        data = request.get_json(silent=True) or {}
        # Clamp to 1..500 so a negative or zero limit never reaches the service
        limit = max(1, min(data.get('limit', 100), 500))

        result = generate_chunk_embeddings(db, limit=limit, user_id=current_user.id)

        return jsonify({
            'success': True,
            'message': f"Wygenerowano embeddings: {result['success']}/{result['total']}",
            **result
        })

    except Exception as e:
        db.rollback()
        logger.error(f"Error generating embeddings: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
@app.route('/api/zopk/knowledge/search', methods=['POST'])
@login_required
def api_zopk_knowledge_search():
    """
    Semantic search in the ZOPK knowledge base.

    Request JSON:
    - query: str (required) - search query
    - limit: int (default 5) - max results (clamped to 1..20)

    Response JSON:
    - chunks: matching knowledge chunks with similarity scores
    - facts: relevant facts
    """
    from zopk_knowledge_service import search_knowledge, get_relevant_facts

    db = SessionLocal()
    try:
        # silent=True: tolerate a missing/non-JSON body instead of letting
        # Flask abort with 400/415 before our own validation runs
        data = request.get_json(silent=True) or {}
        # Strip so a whitespace-only query is rejected too (previously it
        # passed the truthiness check and triggered a wasted embedding call)
        query = (data.get('query') or '').strip()

        if not query:
            return jsonify({'success': False, 'error': 'Query wymagane'}), 400

        # Clamp to 1..20 results
        limit = max(1, min(data.get('limit', 5), 20))

        # Search chunks (semantic / embedding-based)
        chunks = search_knowledge(
            db,
            query=query,
            limit=limit,
            user_id=current_user.id
        )

        # Get relevant facts
        facts = get_relevant_facts(db, query=query, limit=limit)

        return jsonify({
            'success': True,
            'query': query,
            'chunks': chunks,
            'facts': facts
        })

    except Exception as e:
        logger.error(f"Error in knowledge search: {e}")
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# KRS AUDIT (Krajowy Rejestr Sądowy)
|
||||
# ============================================================
|
||||
|
||||
13
database.py
13
database.py
@ -1902,6 +1902,19 @@ class ZOPKNews(Base):
|
||||
is_featured = Column(Boolean, default=False)
|
||||
views_count = Column(Integer, default=0)
|
||||
|
||||
# Full content (scraped from source URL) - for knowledge extraction
|
||||
full_content = Column(Text) # Full article text (without HTML, ads, navigation)
|
||||
content_scraped_at = Column(DateTime) # When content was scraped
|
||||
scrape_status = Column(String(20), default='pending', index=True) # pending, scraped, failed, skipped
|
||||
scrape_error = Column(Text) # Error message if scraping failed
|
||||
scrape_attempts = Column(Integer, default=0) # Number of scraping attempts
|
||||
content_word_count = Column(Integer) # Word count of scraped content
|
||||
content_language = Column(String(10), default='pl') # pl, en
|
||||
|
||||
# Knowledge extraction status
|
||||
knowledge_extracted = Column(Boolean, default=False, index=True) # True if chunks/facts/entities extracted
|
||||
knowledge_extracted_at = Column(DateTime) # When knowledge was extracted
|
||||
|
||||
created_at = Column(DateTime, default=datetime.now)
|
||||
updated_at = Column(DateTime, default=datetime.now, onupdate=datetime.now)
|
||||
|
||||
|
||||
58
database/migrations/015_zopk_full_content.sql
Normal file
58
database/migrations/015_zopk_full_content.sql
Normal file
@ -0,0 +1,58 @@
|
||||
-- Migration 015: Add full_content fields to zopk_news for knowledge base extraction
-- Date: 2026-01-16
-- Purpose: Store scraped article content for AI knowledge extraction
-- Idempotent: uses IF NOT EXISTS throughout, safe to re-run (PostgreSQL).

-- ============================================================
-- ADD NEW COLUMNS TO zopk_news
-- ============================================================

-- Full article content (scraped from source URL, HTML/ads/navigation stripped)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS full_content TEXT;

-- Content scraping metadata
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_scraped_at TIMESTAMP;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_status VARCHAR(20) DEFAULT 'pending';
-- Status values: pending, scraped, failed, skipped

-- Scraping error tracking (message of last failure + retry counter)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_error TEXT;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS scrape_attempts INTEGER DEFAULT 0;

-- Content metadata (extracted during scraping)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_word_count INTEGER;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS content_language VARCHAR(10) DEFAULT 'pl';

-- Knowledge extraction status (set once chunks/facts/entities were extracted)
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS knowledge_extracted BOOLEAN DEFAULT FALSE;
ALTER TABLE zopk_news ADD COLUMN IF NOT EXISTS knowledge_extracted_at TIMESTAMP;

-- ============================================================
-- INDEXES FOR EFFICIENT QUERYING
-- ============================================================

-- Index for finding articles to scrape
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_status ON zopk_news(scrape_status);

-- Index for finding articles ready for knowledge extraction
CREATE INDEX IF NOT EXISTS idx_zopk_news_knowledge_extracted ON zopk_news(knowledge_extracted);

-- Composite index for the scraping pipeline
-- (filter by approval status, then scrape state, then extraction state)
CREATE INDEX IF NOT EXISTS idx_zopk_news_scrape_pipeline
ON zopk_news(status, scrape_status, knowledge_extracted);

-- ============================================================
-- COMMENTS
-- ============================================================

COMMENT ON COLUMN zopk_news.full_content IS 'Full article text scraped from source URL (without HTML, ads, navigation)';
COMMENT ON COLUMN zopk_news.scrape_status IS 'pending=not scraped, scraped=success, failed=error, skipped=not scrapeable';
COMMENT ON COLUMN zopk_news.scrape_error IS 'Error message if scraping failed';
COMMENT ON COLUMN zopk_news.scrape_attempts IS 'Number of scraping attempts (for retry logic)';
COMMENT ON COLUMN zopk_news.content_word_count IS 'Word count of scraped content';
COMMENT ON COLUMN zopk_news.knowledge_extracted IS 'True if chunks/facts/entities extracted';

-- ============================================================
-- GRANT PERMISSIONS
-- ============================================================

GRANT ALL ON TABLE zopk_news TO nordabiz_app;
|
||||
@ -404,6 +404,126 @@ class GeminiService:
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to log API cost: {e}")
|
||||
|
||||
def generate_embedding(
    self,
    text: str,
    task_type: str = 'retrieval_document',
    title: Optional[str] = None,
    user_id: Optional[int] = None,
    feature: str = 'embedding'
) -> Optional[List[float]]:
    """
    Generate an embedding vector for text using Google's text-embedding model.

    Args:
        text: Text to embed
        task_type: One of:
            - 'retrieval_document': For documents to be retrieved
            - 'retrieval_query': For search queries
            - 'semantic_similarity': For comparing texts
            - 'classification': For text classification
            - 'clustering': For text clustering
        title: Optional document title (improves quality; only honored for
            'retrieval_document' — the API rejects it for other task types)
        user_id: User ID for cost tracking
        feature: Feature name for cost tracking

    Returns:
        768-dimensional embedding vector or None on error.

    Cost: ~$0.00001 per 1K tokens (very cheap).
    """
    if not text or not text.strip():
        logger.warning("Empty text provided for embedding")
        return None

    start_time = time.time()

    try:
        # text-embedding-004 (768 dimensions) is Google's recommended model.
        # Fix: forward `title` only for the retrieval_document task type —
        # the embed_content API accepts title solely for that task type and
        # errors out when it is supplied with any other one.
        embed_kwargs = {
            'model': 'models/text-embedding-004',
            'content': text,
            'task_type': task_type,
        }
        if title is not None and task_type == 'retrieval_document':
            embed_kwargs['title'] = title

        result = genai.embed_content(**embed_kwargs)

        embedding = result.get('embedding')

        if not embedding:
            logger.error("No embedding returned from API")
            return None

        # Log cost (embedding API is very cheap)
        latency_ms = int((time.time() - start_time) * 1000)
        token_count = len(text) // 4  # rough heuristic: ~4 chars per token

        # Embedding pricing: ~$0.00001 per 1K tokens
        cost_usd = (token_count / 1000) * 0.00001

        logger.debug(
            f"Embedding generated: {len(embedding)} dims, "
            f"{token_count} tokens, {latency_ms}ms, ${cost_usd:.8f}"
        )

        # Persist usage to the DB only when cost attribution is possible
        if DB_AVAILABLE and user_id:
            try:
                db = SessionLocal()
                try:
                    usage_log = AIUsageLog(
                        request_type=feature,
                        model='text-embedding-004',
                        tokens_input=token_count,
                        tokens_output=0,
                        cost_cents=cost_usd * 100,
                        user_id=user_id,
                        prompt_length=len(text),
                        response_length=len(embedding) * 4,  # 4 bytes per float
                        response_time_ms=latency_ms,
                        success=True
                    )
                    db.add(usage_log)
                    db.commit()
                finally:
                    db.close()
            except Exception as e:
                # Cost logging is best-effort; never fail the embedding call
                logger.error(f"Failed to log embedding cost: {e}")

        return embedding

    except Exception as e:
        logger.error(f"Embedding generation error: {e}")
        return None
|
||||
|
||||
def generate_embeddings_batch(
    self,
    texts: List[str],
    task_type: str = 'retrieval_document',
    user_id: Optional[int] = None
) -> List[Optional[List[float]]]:
    """
    Generate embeddings for multiple texts, one API call per text.

    Args:
        texts: List of texts to embed
        task_type: Task type applied to every embedding
        user_id: User ID for cost tracking

    Returns:
        List of embedding vectors, positionally aligned with ``texts``
        (None for items that failed).
    """
    return [
        self.generate_embedding(
            text=item,
            task_type=task_type,
            user_id=user_id,
            feature='embedding_batch'
        )
        for item in texts
    ]
|
||||
|
||||
|
||||
# Global service instance (initialized in app.py)
|
||||
_gemini_service: Optional[GeminiService] = None
|
||||
|
||||
203
nordabiz_chat.py
203
nordabiz_chat.py
@ -18,12 +18,16 @@ Created: 2025-11-23
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Any, Optional
|
||||
import google.generativeai as genai
|
||||
import gemini_service
|
||||
from search_service import search_companies
|
||||
|
||||
# Module logger
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from database import (
|
||||
SessionLocal,
|
||||
Company,
|
||||
@ -58,6 +62,13 @@ try:
|
||||
except ImportError:
|
||||
FEEDBACK_LEARNING_AVAILABLE = False
|
||||
|
||||
# Import ZOPK knowledge service for semantic search
|
||||
try:
|
||||
from zopk_knowledge_service import search_knowledge, get_relevant_facts
|
||||
ZOPK_KNOWLEDGE_AVAILABLE = True
|
||||
except ImportError:
|
||||
ZOPK_KNOWLEDGE_AVAILABLE = False
|
||||
|
||||
|
||||
class NordaBizChatEngine:
|
||||
"""
|
||||
@ -347,7 +358,7 @@ class NordaBizChatEngine:
|
||||
from datetime import timedelta
|
||||
news_cutoff = datetime.now() - timedelta(days=30)
|
||||
recent_news = db.query(ZOPKNews).filter(
|
||||
ZOPKNews.status == 'approved',
|
||||
ZOPKNews.status.in_(['approved', 'auto_approved']),
|
||||
ZOPKNews.published_at >= news_cutoff
|
||||
).order_by(ZOPKNews.published_at.desc()).limit(10).all()
|
||||
|
||||
@ -361,6 +372,12 @@ class NordaBizChatEngine:
|
||||
for news in recent_news
|
||||
]
|
||||
|
||||
# === ZOPK KNOWLEDGE BASE (semantic search) ===
|
||||
# Detect if question is about ZOPK topics
|
||||
if self._is_zopk_query(current_message):
|
||||
zopk_knowledge = self._get_zopk_knowledge_context(db, current_message)
|
||||
context['zopk_knowledge'] = zopk_knowledge
|
||||
|
||||
# === ETAP 2: Tablica B2B, Kalendarz, Forum ===
|
||||
|
||||
# Add upcoming events (next 60 days)
|
||||
@ -600,6 +617,135 @@ class NordaBizChatEngine:
|
||||
# Extract Company objects from SearchResult
|
||||
return [result.company for result in results]
|
||||
|
||||
def _is_zopk_query(self, message: str) -> bool:
|
||||
"""
|
||||
Check if the message is related to ZOPK (Zielony Okręg Przemysłowy Kaszubia).
|
||||
|
||||
ZOPK topics include:
|
||||
- Offshore wind energy (Baltic Power, Baltica)
|
||||
- Nuclear power plant (Lubiatowo-Kopalino)
|
||||
- Kongsberg investment in Rumia
|
||||
- Infrastructure (Via Pomerania, S6, Droga Czerwona)
|
||||
- Hydrogen, data centers
|
||||
"""
|
||||
zopk_keywords = [
|
||||
# Main project
|
||||
'zopk', 'zielony okręg', 'okręg przemysłowy',
|
||||
# Offshore
|
||||
'offshore', 'farmy wiatrowe', 'energetyka wiatrowa', 'bałtyk', 'baltic power',
|
||||
'baltica', 'orsted', 'morska energia',
|
||||
# Nuclear
|
||||
'elektrownia jądrowa', 'atomowa', 'lubiatowo', 'kopalino', 'pej',
|
||||
# Kongsberg
|
||||
'kongsberg', 'inwestycje norweskie', 'przemysł obronny',
|
||||
# Infrastructure
|
||||
'via pomerania', 'droga czerwona', 's6', 'port gdynia',
|
||||
# Other
|
||||
'wodór', 'centra danych', 'samsonowicz', 'transformacja energetyczna',
|
||||
# Organizations
|
||||
'norda biznes', 'izba przedsiębiorców', 'rumia invest'
|
||||
]
|
||||
|
||||
message_lower = message.lower()
|
||||
return any(kw in message_lower for kw in zopk_keywords)
|
||||
|
||||
def _get_zopk_knowledge_context(self, db, message: str) -> Dict[str, Any]:
    """
    Get ZOPK knowledge base context for the current message.

    Uses semantic search to find relevant:
    - Knowledge chunks (text fragments with embeddings)
    - Facts (structured information)
    - Entities (companies, people, projects)

    Args:
        db: Database session
        message: User's question

    Returns:
        Dict with 'chunks', 'facts', 'entities' lists. The lists stay empty
        when the knowledge service is unavailable or a lookup fails — the
        chat must never break because context building failed.
    """
    # NOTE(review): ZOPKKnowledgeChunk appears unused in this method — confirm
    from database import ZOPKKnowledgeEntity, ZOPKKnowledgeChunk, ZOPKNews

    context = {
        'chunks': [],
        'facts': [],
        'entities': []
    }

    # Check if knowledge service is available (module import may have failed)
    if not ZOPK_KNOWLEDGE_AVAILABLE:
        logger.warning("ZOPK knowledge service not available")
        return context

    try:
        # Semantic search in knowledge chunks
        chunks = search_knowledge(
            db,
            query=message,
            limit=5,
            min_similarity=0.3,  # drop weakly related chunks
            user_id=None  # Don't track cost for context building
        )

        # Enrich chunks with source information (portal name + publish date)
        for c in chunks:
            chunk_data = {
                'content': c['content'][:400],  # Limit length for the prompt
                'summary': c.get('summary', ''),
                'similarity': c.get('similarity', 0),
                'source': 'nieznane',
                'date': ''
            }

            # Get source news info if available
            if c.get('source_news_id'):
                news = db.query(ZOPKNews).filter(
                    ZOPKNews.id == c['source_news_id']
                ).first()
                if news:
                    chunk_data['source'] = news.source_name or news.source_domain or 'nieznane'
                    if news.published_at:
                        chunk_data['date'] = news.published_at.strftime('%Y-%m-%d')

            context['chunks'].append(chunk_data)

        # Get relevant facts (structured statements with confidence scores)
        facts = get_relevant_facts(db, query=message, limit=5)
        context['facts'] = [
            {
                'fact': f['full_text'],
                'type': f['fact_type'],
                'confidence': f.get('confidence_score', 0),
                'value': f.get('numeric_value'),
                'unit': f.get('numeric_unit')
            }
            for f in facts
        ]

        # Get top mentioned entities (always included for context, not
        # filtered by the query — mentions_count > 1 skips one-off mentions)
        top_entities = db.query(ZOPKKnowledgeEntity).filter(
            ZOPKKnowledgeEntity.mentions_count > 1
        ).order_by(
            ZOPKKnowledgeEntity.mentions_count.desc()
        ).limit(10).all()

        context['entities'] = [
            {
                'name': e.name,
                'type': e.entity_type,
                'description': e.short_description or '',
                'mentions': e.mentions_count
            }
            for e in top_entities
        ]

    except Exception as e:
        logger.error(f"Error getting ZOPK knowledge context: {e}")
        # Return empty context on error, don't break chat

    return context
|
||||
|
||||
def _query_ai(
|
||||
self,
|
||||
context: Dict[str, Any],
|
||||
@ -799,6 +945,61 @@ BŁĘDNIE (NIE RÓB - resetuje numerację):
|
||||
system_prompt += json.dumps(context['recent_news'], ensure_ascii=False, indent=None)
|
||||
system_prompt += "\n"
|
||||
|
||||
# Add ZOPK Knowledge Base context (semantic search results)
|
||||
if context.get('zopk_knowledge'):
|
||||
zopk = context['zopk_knowledge']
|
||||
system_prompt += "\n\n🌍 BAZA WIEDZY ZOPK (Zielony Okręg Przemysłowy Kaszubia):\n"
|
||||
system_prompt += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
||||
|
||||
# Add knowledge chunks (most relevant excerpts)
|
||||
if zopk.get('chunks'):
|
||||
system_prompt += "\n📄 FRAGMENTY WIEDZY (semantycznie dopasowane):\n"
|
||||
for i, chunk in enumerate(zopk['chunks'][:5], 1):
|
||||
system_prompt += f"\n[{i}] {chunk.get('summary', '')}\n"
|
||||
system_prompt += f" Źródło: {chunk.get('source', 'nieznane')} ({chunk.get('date', '')})\n"
|
||||
if chunk.get('content'):
|
||||
# Skrócona treść (max 300 znaków)
|
||||
content_preview = chunk['content'][:300]
|
||||
if len(chunk['content']) > 300:
|
||||
content_preview += "..."
|
||||
system_prompt += f" Treść: {content_preview}\n"
|
||||
|
||||
# Add verified facts
|
||||
if zopk.get('facts'):
|
||||
system_prompt += "\n📌 ZWERYFIKOWANE FAKTY:\n"
|
||||
for fact in zopk['facts'][:10]:
|
||||
confidence_stars = "★" * int(fact.get('confidence', 0) * 5)
|
||||
system_prompt += f"• {fact.get('fact', '')} [{confidence_stars}]\n"
|
||||
if fact.get('value') and fact.get('unit'):
|
||||
system_prompt += f" Wartość: {fact['value']} {fact['unit']}\n"
|
||||
|
||||
# Add key entities
|
||||
if zopk.get('entities'):
|
||||
system_prompt += "\n🏢 KLUCZOWE PODMIOTY ZOPK:\n"
|
||||
for entity in zopk['entities'][:8]:
|
||||
entity_icon = {
|
||||
'organization': '🏛️',
|
||||
'company': '🏢',
|
||||
'person': '👤',
|
||||
'location': '📍',
|
||||
'project': '🎯',
|
||||
'technology': '⚡'
|
||||
}.get(entity.get('type', ''), '•')
|
||||
system_prompt += f"{entity_icon} {entity.get('name', '')} ({entity.get('type', '')})"
|
||||
if entity.get('description'):
|
||||
system_prompt += f" - {entity['description']}"
|
||||
if entity.get('mentions'):
|
||||
system_prompt += f" [{entity['mentions']} wzmianek]"
|
||||
system_prompt += "\n"
|
||||
|
||||
system_prompt += "\n🎯 ZASADY ODPOWIEDZI O ZOPK:\n"
|
||||
system_prompt += "1. Odpowiadaj na podstawie bazy wiedzy (nie wymyślaj faktów)\n"
|
||||
system_prompt += "2. Cytuj źródła: \"Według [portal] z [data]...\"\n"
|
||||
system_prompt += "3. Podawaj konkretne daty i liczby gdy dostępne\n"
|
||||
system_prompt += "4. Wymieniaj organizacje i osoby zaangażowane\n"
|
||||
system_prompt += "5. Jeśli brak informacji w bazie - powiedz wprost\n"
|
||||
system_prompt += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
|
||||
|
||||
# Add upcoming events (Etap 2)
|
||||
if context.get('upcoming_events'):
|
||||
system_prompt += "\n\n📅 KALENDARZ WYDARZEŃ:\n"
|
||||
|
||||
670
zopk_content_scraper.py
Normal file
670
zopk_content_scraper.py
Normal file
@ -0,0 +1,670 @@
|
||||
"""
|
||||
ZOPK Content Scraper - Pobieranie pełnej treści artykułów dla bazy wiedzy.
|
||||
|
||||
Scraper respektuje robots.txt i stosuje rate limiting.
|
||||
Obsługuje główne polskie portale newsowe.
|
||||
|
||||
Usage:
|
||||
from zopk_content_scraper import ZOPKContentScraper
|
||||
|
||||
scraper = ZOPKContentScraper(db_session)
|
||||
result = scraper.scrape_article(news_id=123)
|
||||
# lub batch:
|
||||
result = scraper.batch_scrape(limit=50)
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
import logging
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from urllib.parse import urlparse
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, Comment, NavigableString
|
||||
|
||||
from database import ZOPKNews
|
||||
|
||||
# Configure logging
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ============================================================
# CONFIGURATION
# ============================================================

# User-Agent identifying the bot. Includes a bot-info URL and a contact
# e-mail so site operators can identify and reach us (polite-crawling
# convention; also lets sites target us in robots.txt).
USER_AGENT = 'NordaBizBot/1.0 (+https://nordabiznes.pl/bot; kontakt@nordabiznes.pl)'

# Request timeout in seconds (applies to each HTTP fetch).
REQUEST_TIMEOUT = 15

# Maximum content length (chars) to avoid memory issues when a page is huge.
MAX_CONTENT_LENGTH = 100000  # ~100KB of text

# Rate limiting: minimum seconds between consecutive requests per domain.
# 'default' applies to any domain not listed explicitly.
RATE_LIMITS = {
    'trojmiasto.pl': 2.0,
    'dziennikbaltycki.pl': 2.0,
    'nordafm.pl': 1.5,
    'ttm24.pl': 1.5,
    'radiogdansk.pl': 1.5,
    'portalmorski.pl': 1.5,
    'biznes.pap.pl': 2.0,
    'default': 3.0
}

# Maximum retry attempts; articles whose scrape_attempts reaches this count
# are no longer retried by batch_scrape() unless force=True.
MAX_RETRY_ATTEMPTS = 3

# ============================================================
# CONTENT SELECTORS PER DOMAIN
# ============================================================

# CSS selectors for article content extraction.
# Order matters - first match wins; 'default' is the fallback list used
# for domains without a dedicated entry.
CONTENT_SELECTORS = {
    'trojmiasto.pl': [
        'article.article-content',
        'div.article-body',
        'div.article__content',
        'div[itemprop="articleBody"]',
    ],
    'dziennikbaltycki.pl': [
        'div.article-body',
        'article.article-main',
        'div[itemprop="articleBody"]',
        'div.art-content',
    ],
    'nordafm.pl': [
        'div.entry-content',
        'article.post-content',
        'div.post-body',
    ],
    'ttm24.pl': [
        'div.post-content',
        'article.entry-content',
        'div.article-content',
    ],
    'radiogdansk.pl': [
        'div.article-content',
        'div.entry-content',
        'article.post',
    ],
    'portalmorski.pl': [
        'div.article-content',
        'div.entry-content',
        'article.post-content',
    ],
    'biznes.pap.pl': [
        'div.article-content',
        'div.news-content',
        'article.content',
    ],
    'gov.pl': [
        'div.article-content',
        'main.main-content',
        'div.content',
    ],
    'default': [
        'article',
        'div[itemprop="articleBody"]',
        'div.article-content',
        'div.article-body',
        'div.entry-content',
        'div.post-content',
        'main.content',
        'main',
    ]
}

# Elements (boilerplate/ads/navigation) stripped from pages before text
# extraction. Entries starting with '.', '#' or '[' are treated as CSS
# selectors; plain names are treated as tag names (see _clean_html).
ELEMENTS_TO_REMOVE = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
    'form', 'iframe', 'noscript', 'svg', 'canvas',
    '.advertisement', '.ad', '.ads', '.advert', '.banner',
    '.social-share', '.share-buttons', '.sharing',
    '.related-articles', '.related-posts', '.recommendations',
    '.comments', '.comment-section', '#comments',
    '.newsletter', '.subscription', '.subscribe',
    '.cookie-notice', '.cookie-banner', '.gdpr',
    '.popup', '.modal', '.overlay',
    '.sidebar', '.widget', '.navigation',
    '.breadcrumb', '.breadcrumbs',
    '.author-bio', '.author-box',
    '.tags', '.tag-list', '.categories',
    '.pagination', '.pager',
    '[data-ad]', '[data-advertisement]',
]

# Domains that are not scrapeable (social media requiring login, dynamic
# content, paywalls, etc.); articles from these are marked 'skipped'.
SKIP_DOMAINS = [
    'facebook.com',
    'twitter.com',
    'x.com',
    'linkedin.com',
    'youtube.com',
    'instagram.com',
]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# DATA CLASSES
|
||||
# ============================================================
|
||||
|
||||
@dataclass
class ScrapeResult:
    """Result of scraping an article.

    Returned by ZOPKContentScraper.scrape_article(). Callers should branch
    on `status` (it distinguishes 'skipped' domains from real failures),
    not on `success` alone.
    """
    success: bool
    content: Optional[str] = None  # extracted plain text; None when not scraped
    word_count: int = 0  # word count of `content` (0 when not scraped)
    error: Optional[str] = None  # human-readable error or skip reason
    status: str = 'pending'  # scraped, failed, skipped
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SCRAPER CLASS
|
||||
# ============================================================
|
||||
|
||||
class ZOPKContentScraper:
    """
    Scraper for ZOPK news article content.

    Features:
    - Domain-specific content selectors (CONTENT_SELECTORS)
    - Rate limiting per domain (RATE_LIMITS)
    - HTML cleaning (removes ads, navigation, comments, etc.)
    - Bounded retries for failed articles (MAX_RETRY_ATTEMPTS)
    - Identifies itself via USER_AGENT so site operators can apply policies
    """

    def __init__(self, db_session, user_id: Optional[int] = None):
        """
        Initialize scraper.

        Args:
            db_session: SQLAlchemy database session
            user_id: Optional user ID for audit logging
        """
        self.db = db_session
        self.user_id = user_id
        # Wall-clock timestamp of the last request per domain; consulted by
        # _wait_for_rate_limit() to throttle consecutive fetches.
        self._last_request_time: Dict[str, float] = {}
        self._session = self._create_session()

    def _create_session(self) -> requests.Session:
        """Create requests session with proper headers."""
        session = requests.Session()
        session.headers.update({
            'User-Agent': USER_AGENT,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'pl-PL,pl;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        return session

    @staticmethod
    def _domain_matches(domain: str, known: str) -> bool:
        """
        Return True if *domain* equals *known* or is a subdomain of it.

        Substring/suffix checks are unsafe here: 'x.com' is a substring of
        'box.com' and 'evilx.com'.endswith('x.com') is True. Requiring an
        exact match or a '.'-delimited boundary avoids such false positives.
        """
        return domain == known or domain.endswith('.' + known)

    def _get_domain(self, url: str) -> str:
        """Extract normalized domain (lowercase, no 'www.' prefix) from URL."""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            # Remove www. prefix
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        except Exception:
            return 'unknown'

    def _get_rate_limit(self, domain: str) -> float:
        """Get rate limit (seconds between requests) for domain."""
        # Check exact domain first
        if domain in RATE_LIMITS:
            return RATE_LIMITS[domain]
        # Subdomain match; skip the 'default' sentinel key so it never
        # matches a real hostname (consistent with _get_content_selectors).
        for known_domain, limit in RATE_LIMITS.items():
            if known_domain != 'default' and self._domain_matches(domain, known_domain):
                return limit
        return RATE_LIMITS['default']

    def _wait_for_rate_limit(self, domain: str) -> None:
        """Sleep if needed so requests to *domain* respect its rate limit."""
        limit = self._get_rate_limit(domain)
        last_time = self._last_request_time.get(domain, 0)
        elapsed = time.time() - last_time
        if elapsed < limit:
            wait_time = limit - elapsed
            logger.debug(f"Rate limiting: waiting {wait_time:.2f}s for {domain}")
            time.sleep(wait_time)
        self._last_request_time[domain] = time.time()

    def _should_skip_domain(self, domain: str) -> bool:
        """Check if domain is on the skip list (social media/paywalls)."""
        # Boundary-aware match: a bare substring test would wrongly skip
        # e.g. 'box.com' because it contains 'x.com'.
        return any(self._domain_matches(domain, skip) for skip in SKIP_DOMAINS)

    def _get_content_selectors(self, domain: str) -> List[str]:
        """Get ordered CSS selectors to try for *domain* (first match wins)."""
        # Check exact domain
        if domain in CONTENT_SELECTORS:
            return CONTENT_SELECTORS[domain]
        # Subdomain match against known domains
        for known_domain, selectors in CONTENT_SELECTORS.items():
            if known_domain != 'default' and self._domain_matches(domain, known_domain):
                return selectors
        return CONTENT_SELECTORS['default']

    def _fetch_html(self, url: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Fetch HTML content from URL.

        Applies the domain skip list and per-domain rate limiting before
        issuing the request.

        Returns:
            Tuple of (html_content, error_message) - exactly one is None.
        """
        domain = self._get_domain(url)

        # Check if domain should be skipped
        if self._should_skip_domain(domain):
            return None, f"Domain {domain} is not scrapeable (social media/paywall)"

        # Apply rate limiting
        self._wait_for_rate_limit(domain)

        try:
            response = self._session.get(
                url,
                timeout=REQUEST_TIMEOUT,
                allow_redirects=True
            )
            response.raise_for_status()

            # Only HTML/XHTML responses can be parsed as articles.
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type and 'application/xhtml' not in content_type:
                return None, f"Not HTML content: {content_type}"

            # Some portals mis-declare the charset in headers; prefer the
            # content-sniffed encoding, falling back to UTF-8.
            response.encoding = response.apparent_encoding or 'utf-8'

            return response.text, None

        except requests.exceptions.Timeout:
            return None, "Request timeout"
        except requests.exceptions.TooManyRedirects:
            return None, "Too many redirects"
        except requests.exceptions.HTTPError as e:
            # e.response can be None in rare cases; don't crash while reporting.
            status = e.response.status_code if e.response is not None else 'unknown'
            return None, f"HTTP error: {status}"
        except requests.exceptions.ConnectionError:
            return None, "Connection error"
        except requests.exceptions.RequestException as e:
            return None, f"Request error: {str(e)}"

    def _clean_html(self, soup: BeautifulSoup) -> BeautifulSoup:
        """Remove unwanted elements (ads, nav, comments, ...) in place."""
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Remove unwanted elements listed in ELEMENTS_TO_REMOVE
        for selector in ELEMENTS_TO_REMOVE:
            if selector.startswith('.') or selector.startswith('#') or selector.startswith('['):
                # CSS selector
                for element in soup.select(selector):
                    element.decompose()
            else:
                # Tag name
                for element in soup.find_all(selector):
                    element.decompose()

        return soup

    def _extract_content(self, html: str, domain: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Extract article content from HTML.

        Tries domain-specific selectors first, then falls back to the
        largest text block heuristic.

        Returns:
            Tuple of (content_text, error_message) - exactly one is None.
        """
        try:
            soup = BeautifulSoup(html, 'html.parser')

            # Clean HTML first
            soup = self._clean_html(soup)

            # Try domain-specific selectors
            selectors = self._get_content_selectors(domain)
            content_element = None

            for selector in selectors:
                content_element = soup.select_one(selector)
                if content_element:
                    logger.debug(f"Found content with selector: {selector}")
                    break

            if not content_element:
                # Fallback: try to find largest text block
                content_element = self._find_largest_text_block(soup)

            if not content_element:
                return None, "Could not find article content"

            # Extract text
            text = self._extract_text(content_element)

            # Very short extractions are almost always navigation/teaser text.
            if not text or len(text) < 100:
                return None, "Extracted content too short"

            # Truncate if too long
            if len(text) > MAX_CONTENT_LENGTH:
                text = text[:MAX_CONTENT_LENGTH] + "..."
                logger.warning(f"Content truncated to {MAX_CONTENT_LENGTH} chars")

            return text, None

        except Exception as e:
            logger.error(f"Error extracting content: {e}")
            return None, f"Extraction error: {str(e)}"

    def _find_largest_text_block(self, soup: BeautifulSoup) -> Optional[BeautifulSoup]:
        """Find the largest text block in the page (fallback method)."""
        candidates = soup.find_all(['article', 'main', 'div', 'section'])

        best_element = None
        best_score = 0

        for element in candidates:
            # Skip small elements
            text = element.get_text(strip=True)
            if len(text) < 200:
                continue

            # Score = text length plus a bonus per paragraph, so article-like
            # containers beat link farms of similar length.
            paragraphs = len(element.find_all('p'))
            text_length = len(text)
            score = text_length + (paragraphs * 100)

            if score > best_score:
                best_score = score
                best_element = element

        return best_element

    def _extract_text(self, element: BeautifulSoup) -> str:
        """Extract clean text from element, preserving paragraph breaks."""
        # Collect text fragments; block-level tags contribute '\n' markers
        # so paragraphs stay separated in the output.
        lines = []

        for child in element.descendants:
            if isinstance(child, NavigableString):
                text = str(child).strip()
                if text:
                    lines.append(text)
            elif child.name in ['br', 'p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
                lines.append('\n')

        # Join and clean
        text = ' '.join(lines)

        # Collapse horizontal whitespace only - a blanket r'\s+' -> ' ' would
        # also destroy the '\n' paragraph markers inserted above, flattening
        # the whole article onto a single line.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Trim spaces around newlines, then cap runs of blank lines at one.
        text = re.sub(r' ?\n ?', '\n', text)
        text = re.sub(r'\n{3,}', '\n\n', text)
        text = text.strip()

        return text

    def _count_words(self, text: str) -> int:
        """Count words in text."""
        if not text:
            return 0
        words = re.findall(r'\b\w+\b', text)
        return len(words)

    def scrape_article(self, news_id: int) -> ScrapeResult:
        """
        Scrape content for a single article.

        Updates the ZOPKNews row (full_content, scrape_status, scrape_error,
        scrape_attempts, content_word_count, content_scraped_at) and commits.

        Args:
            news_id: ID of ZOPKNews record

        Returns:
            ScrapeResult with content or error
        """
        # Get news record
        news = self.db.query(ZOPKNews).filter(ZOPKNews.id == news_id).first()

        if not news:
            return ScrapeResult(
                success=False,
                error=f"News record {news_id} not found",
                status='failed'
            )

        # Already scraped: return cached content, no network round-trip.
        if news.scrape_status == 'scraped' and news.full_content:
            return ScrapeResult(
                success=True,
                content=news.full_content,
                word_count=news.content_word_count or 0,
                status='scraped'
            )

        url = news.url
        domain = self._get_domain(url)

        logger.info(f"Scraping article {news_id}: {url}")

        # Skipped domains are terminal - no attempt counter, never retried.
        if self._should_skip_domain(domain):
            news.scrape_status = 'skipped'
            news.scrape_error = f"Domain {domain} not scrapeable"
            self.db.commit()
            return ScrapeResult(
                success=False,
                error=f"Domain {domain} not scrapeable",
                status='skipped'
            )

        # Fetch HTML
        html, fetch_error = self._fetch_html(url)

        if fetch_error:
            self._record_failure(news, fetch_error)
            return ScrapeResult(
                success=False,
                error=fetch_error,
                status='failed'
            )

        # Extract content
        content, extract_error = self._extract_content(html, domain)

        if extract_error:
            self._record_failure(news, extract_error)
            return ScrapeResult(
                success=False,
                error=extract_error,
                status='failed'
            )

        # Success - update database
        word_count = self._count_words(content)

        news.full_content = content
        news.content_word_count = word_count
        news.content_scraped_at = datetime.now()
        news.scrape_status = 'scraped'
        news.scrape_error = None
        news.scrape_attempts = (news.scrape_attempts or 0) + 1

        self.db.commit()

        logger.info(f"Successfully scraped article {news_id}: {word_count} words")

        return ScrapeResult(
            success=True,
            content=content,
            word_count=word_count,
            status='scraped'
        )

    def _record_failure(self, news, error: str) -> None:
        """Persist a failed scrape attempt on the news row and commit."""
        news.scrape_status = 'failed'
        news.scrape_error = error
        news.scrape_attempts = (news.scrape_attempts or 0) + 1
        self.db.commit()

    def batch_scrape(
        self,
        limit: int = 50,
        status_filter: Optional[str] = None,
        force: bool = False
    ) -> Dict:
        """
        Batch scrape articles.

        Args:
            limit: Maximum number of articles to scrape
            status_filter: Filter by approval status (approved, auto_approved)
            force: If True, re-scrape even already scraped articles

        Returns:
            Dict with statistics: total, scraped, failed, skipped, errors,
            scraped_articles, processing_time (seconds).
        """
        logger.info(f"Starting batch scrape: limit={limit}, force={force}")

        # Build query
        query = self.db.query(ZOPKNews)

        # Filter by approval status
        if status_filter:
            query = query.filter(ZOPKNews.status == status_filter)
        else:
            # Default: only approved/auto_approved articles
            query = query.filter(ZOPKNews.status.in_(['approved', 'auto_approved']))

        # Filter by scrape status
        if not force:
            # Candidates: never-scraped articles - including rows predating
            # the scrape_status column, where it is NULL (the stats endpoint
            # counts NULL as pending, so treat it the same here) - plus
            # failed ones that still have retries left.
            query = query.filter(
                (ZOPKNews.scrape_status == 'pending') |
                (ZOPKNews.scrape_status.is_(None)) |
                ((ZOPKNews.scrape_status == 'failed') & (ZOPKNews.scrape_attempts < MAX_RETRY_ATTEMPTS))
            )

        # Order by creation date (newest first)
        query = query.order_by(ZOPKNews.created_at.desc())

        # Limit
        articles = query.limit(limit).all()

        # Statistics
        stats = {
            'total': len(articles),
            'scraped': 0,
            'failed': 0,
            'skipped': 0,
            'errors': [],
            'scraped_articles': [],
            'processing_time': 0
        }

        start_time = time.time()

        for article in articles:
            result = self.scrape_article(article.id)

            if result.status == 'scraped':
                stats['scraped'] += 1
                stats['scraped_articles'].append({
                    'id': article.id,
                    'title': article.title[:100],
                    'word_count': result.word_count,
                    'source': article.source_name
                })
            elif result.status == 'skipped':
                stats['skipped'] += 1
            else:
                stats['failed'] += 1
                stats['errors'].append({
                    'id': article.id,
                    'url': article.url,
                    'error': result.error
                })

        stats['processing_time'] = round(time.time() - start_time, 2)

        logger.info(
            f"Batch scrape complete: {stats['scraped']} scraped, "
            f"{stats['failed']} failed, {stats['skipped']} skipped "
            f"in {stats['processing_time']}s"
        )

        return stats

    def get_scrape_statistics(self) -> Dict:
        """Get scraping statistics for approved/auto_approved articles."""
        from sqlalchemy import func

        # Count by scrape_status
        status_counts = self.db.query(
            ZOPKNews.scrape_status,
            func.count(ZOPKNews.id)
        ).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).group_by(ZOPKNews.scrape_status).all()

        status_dict = {status: count for status, count in status_counts}

        # Total approved articles
        total_approved = self.db.query(func.count(ZOPKNews.id)).filter(
            ZOPKNews.status.in_(['approved', 'auto_approved'])
        ).scalar()

        # Articles ready for knowledge extraction
        ready_for_extraction = self.db.query(func.count(ZOPKNews.id)).filter(
            ZOPKNews.scrape_status == 'scraped',
            ZOPKNews.knowledge_extracted == False
        ).scalar()

        # Average word count
        avg_word_count = self.db.query(func.avg(ZOPKNews.content_word_count)).filter(
            ZOPKNews.scrape_status == 'scraped'
        ).scalar()

        return {
            'total_approved': total_approved or 0,
            'scraped': status_dict.get('scraped', 0),
            # Rows with NULL scrape_status (pre-migration) count as pending.
            'pending': status_dict.get('pending', 0) + status_dict.get(None, 0),
            'failed': status_dict.get('failed', 0),
            'skipped': status_dict.get('skipped', 0),
            'ready_for_extraction': ready_for_extraction or 0,
            # int, not round(x, 0): avoids a float like 532.0 in the JSON.
            'avg_word_count': int(round(avg_word_count or 0))
        }
|
||||
|
||||
|
||||
# ============================================================
|
||||
# STANDALONE FUNCTIONS FOR CRON/CLI
|
||||
# ============================================================
|
||||
|
||||
def scrape_pending_articles(db_session, limit: int = 50) -> Dict:
    """
    Convenience function for cron jobs.

    Usage:
        from zopk_content_scraper import scrape_pending_articles
        result = scrape_pending_articles(db_session, limit=50)
    """
    # Delegate straight to a throwaway scraper instance.
    return ZOPKContentScraper(db_session).batch_scrape(limit=limit)
|
||||
|
||||
|
||||
def get_scrape_stats(db_session) -> Dict:
    """
    Get scraping statistics for monitoring.
    """
    # One-shot scraper instance just for the stats query.
    return ZOPKContentScraper(db_session).get_scrape_statistics()
|
||||
1039
zopk_knowledge_service.py
Normal file
1039
zopk_knowledge_service.py
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user