diff --git a/app.py b/app.py
index 5f169ee..be99c41 100644
--- a/app.py
+++ b/app.py
@@ -11131,6 +11131,354 @@ def admin_zopk_generate_embeddings():
db.close()
+# ============================================================
+# ZOPK SSE ENDPOINTS (Server-Sent Events for Progress Tracking)
+# ============================================================
+
def sse_progress_generator(operation_func, db, **kwargs):
    """
    Generic SSE generator for progress tracking.

    Runs ``operation_func`` in a worker thread and streams every progress
    update it reports (through its ``progress_callback`` keyword argument)
    as a Server-Sent Event, followed by one final ``result`` event.

    Args:
        operation_func: Function to call; must accept a ``progress_callback``
            keyword argument and may return a dict that is merged into the
            final ``result`` event.
        db: Database session (unused here; kept for interface compatibility —
            the session lifecycle is managed by the caller).
        **kwargs: Additional keyword arguments forwarded to operation_func.

    Yields:
        str: SSE-formatted ``data: ...\\n\\n`` events.
    """
    import json
    import queue
    import threading
    from dataclasses import asdict

    # Thread-safe queue instead of a bare list: the worker thread produces
    # updates while this generator consumes them concurrently. A blocking
    # get() also removes the original 0.1 s busy-wait polling loop.
    updates = queue.Queue()
    _DONE = object()  # sentinel: worker finished, no more updates will arrive
    result_container = {}

    def progress_callback(update):
        updates.put(update)

    def worker():
        try:
            result_container['result'] = operation_func(
                progress_callback=progress_callback, **kwargs
            )
        except Exception as e:
            logger.error(f"SSE operation error: {e}")
            result_container['result'] = {'error': str(e)}
        finally:
            # Always unblock the consumer, even when the operation raised.
            updates.put(_DONE)

    thread = threading.Thread(target=worker, daemon=True)
    thread.start()

    # Stream updates in arrival order until the worker signals completion.
    while True:
        item = updates.get()
        if item is _DONE:
            break
        yield f"data: {json.dumps(asdict(item), ensure_ascii=False)}\n\n"

    thread.join()

    # Final event; merges the operation's returned dict (if any) into it.
    final_result = result_container.get('result') or {}
    yield f"data: {json.dumps({'type': 'result', **final_result}, ensure_ascii=False)}\n\n"
+
+
@app.route('/admin/zopk/news/scrape-content/stream', methods=['GET'])
@login_required
def admin_zopk_scrape_content_stream():
    """
    SSE endpoint for streaming scrape progress.

    Streams one ``data:`` event per article as it is scraped, then a final
    ``complete`` (or ``error``) event.

    Query params:
        - limit: int (default 30, max 100) - max articles to scrape
        - force: bool (default false) - re-scrape already scraped
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    from zopk_content_scraper import ZOPKContentScraper

    # Guard against non-numeric ?limit= values instead of raising a 500.
    try:
        limit = min(int(request.args.get('limit', 30)), 100)
    except (TypeError, ValueError):
        limit = 30
    force = request.args.get('force', 'false').lower() == 'true'
    # Captured here: current_user is request-bound and must not be read
    # inside the streamed generator, which runs outside the request context.
    user_id = current_user.id

    def generate():
        import json
        import time

        db = SessionLocal()
        try:
            from zopk_content_scraper import MAX_RETRY_ATTEMPTS
            from database import ZOPKNews

            scraper = ZOPKContentScraper(db, user_id=user_id)

            # Approved articles only; optionally restricted to those that
            # still need scraping (pending, or failed with retries left).
            query = db.query(ZOPKNews).filter(
                ZOPKNews.status.in_(['approved', 'auto_approved'])
            )

            if not force:
                query = query.filter(
                    (ZOPKNews.scrape_status == 'pending') |
                    ((ZOPKNews.scrape_status == 'failed') & (ZOPKNews.scrape_attempts < MAX_RETRY_ATTEMPTS))
                )

            query = query.order_by(ZOPKNews.created_at.desc())
            articles = query.limit(limit).all()
            total = len(articles)

            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak artykułów do scrapowania', 'total': 0}, ensure_ascii=False)}\n\n"
                return

            # Initial event so the client can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'scraping', 'status': 'processing', 'message': f'Rozpoczynam scraping {total} artykułów...'}, ensure_ascii=False)}\n\n"

            stats = {'scraped': 0, 'failed': 0, 'skipped': 0}
            start_time = time.time()

            for idx, article in enumerate(articles, 1):
                # "processing" event before the (potentially slow) fetch.
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'scraping', 'status': 'processing', 'message': f'Pobieram: {article.title[:50]}...', 'article_id': article.id, 'article_title': article.title[:80], 'details': {'source': article.source_name or 'nieznane', **stats}}, ensure_ascii=False)}\n\n"

                result = scraper.scrape_article(article.id)

                if result.status == 'scraped':
                    stats['scraped'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'success', 'message': f'✓ {result.word_count} słów: {article.title[:40]}...', 'article_id': article.id, 'details': {'word_count': result.word_count, **stats}}, ensure_ascii=False)}\n\n"
                elif result.status == 'skipped':
                    stats['skipped'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'skipped', 'message': f'⊘ Pominięto: {article.title[:40]}...', 'article_id': article.id, 'details': stats}, ensure_ascii=False)}\n\n"
                else:
                    stats['failed'] += 1
                    error_msg = result.error[:50] if result.error else 'Nieznany błąd'
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'scraping', 'status': 'failed', 'message': f'✗ {error_msg}', 'article_id': article.id, 'details': {'error': result.error, **stats}}, ensure_ascii=False)}\n\n"

            processing_time = round(time.time() - start_time, 2)

            # Final "complete" event with summary counters.
            scraped_count = stats['scraped']
            failed_count = stats['failed']
            skipped_count = stats['skipped']
            complete_msg = f'Zakończono: {scraped_count} pobrano, {failed_count} błędów, {skipped_count} pominięto'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'scraping', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"

        except Exception as e:
            logger.error(f"SSE scraping error: {e}")
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()

    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
+
+
@app.route('/admin/zopk/knowledge/extract/stream', methods=['GET'])
@login_required
def admin_zopk_knowledge_extract_stream():
    """
    SSE endpoint for streaming knowledge extraction progress.

    Streams one ``data:`` event per processed article, then a final
    ``complete`` (or ``error``) event.

    Query params:
        - limit: int (default 10, max 50) - max articles to process
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    # Guard against non-numeric ?limit= values instead of raising a 500.
    try:
        limit = min(int(request.args.get('limit', 10)), 50)
    except (TypeError, ValueError):
        limit = 10
    # Captured here: current_user is request-bound and must not be read
    # inside the streamed generator, which runs outside the request context.
    user_id = current_user.id

    def generate():
        import json
        import time

        db = SessionLocal()
        try:
            from zopk_knowledge_service import ZOPKKnowledgeService
            from database import ZOPKNews

            service = ZOPKKnowledgeService(db, user_id=user_id)

            # Approved + scraped articles whose knowledge is not extracted yet.
            articles = db.query(ZOPKNews).filter(
                ZOPKNews.status.in_(['approved', 'auto_approved']),
                ZOPKNews.scrape_status == 'scraped',
                ZOPKNews.knowledge_extracted.is_(False)
            ).order_by(
                ZOPKNews.created_at.desc()
            ).limit(limit).all()

            total = len(articles)

            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak artykułów do ekstrakcji', 'total': 0}, ensure_ascii=False)}\n\n"
                return

            # Initial event so the client can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'extracting', 'status': 'processing', 'message': f'Rozpoczynam ekstrakcję z {total} artykułów...'}, ensure_ascii=False)}\n\n"

            stats = {'success': 0, 'failed': 0, 'chunks': 0, 'facts': 0, 'entities': 0}
            start_time = time.time()

            for idx, article in enumerate(articles, 1):
                # "processing" event before the (slow) AI call.
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'extracting', 'status': 'processing', 'message': f'Analizuję AI: {article.title[:50]}...', 'article_id': article.id, 'article_title': article.title[:80], 'details': stats}, ensure_ascii=False)}\n\n"

                result = service.extract_from_news(article.id)

                if result.success:
                    stats['success'] += 1
                    stats['chunks'] += result.chunks_created
                    stats['facts'] += result.facts_created
                    stats['entities'] += result.entities_created
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'extracting', 'status': 'success', 'message': f'✓ {result.chunks_created}ch, {result.facts_created}f, {result.entities_created}e', 'article_id': article.id, 'details': {'new_chunks': result.chunks_created, 'new_facts': result.facts_created, 'new_entities': result.entities_created, **stats}}, ensure_ascii=False)}\n\n"
                else:
                    stats['failed'] += 1
                    error_msg = result.error[:50] if result.error else 'Nieznany błąd'
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'extracting', 'status': 'failed', 'message': f'✗ {error_msg}', 'article_id': article.id, 'details': {'error': result.error, **stats}}, ensure_ascii=False)}\n\n"

            processing_time = round(time.time() - start_time, 2)

            # Final "complete" event with summary counters.
            success_count = stats['success']
            chunks_count = stats['chunks']
            facts_count = stats['facts']
            entities_count = stats['entities']
            complete_msg = f'Zakończono: {success_count}/{total}. Utworzono: {chunks_count}ch, {facts_count}f, {entities_count}e'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'extracting', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"

        except Exception as e:
            logger.error(f"SSE extraction error: {e}")
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()

    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
+
+
@app.route('/admin/zopk/knowledge/embeddings/stream', methods=['GET'])
@login_required
def admin_zopk_embeddings_stream():
    """
    SSE endpoint for streaming embeddings generation progress.

    Streams one ``data:`` event per chunk. Progress is committed in batches
    so embeddings already generated survive a mid-run failure.

    Query params:
        - limit: int (default 50, max 200) - max chunks to process
    """
    if not current_user.is_admin:
        return jsonify({'success': False, 'error': 'Brak uprawnień'}), 403

    # Guard against non-numeric ?limit= values instead of raising a 500.
    try:
        limit = min(int(request.args.get('limit', 50)), 200)
    except (TypeError, ValueError):
        limit = 50
    user_id = current_user.id

    # Commit every N chunks so a dropped connection or crash doesn't lose
    # the whole run's (paid) API work.
    COMMIT_EVERY = 10

    def generate():
        import json
        import time
        from gemini_service import GeminiService

        db = SessionLocal()
        try:
            from database import ZOPKKnowledgeChunk

            gemini = GeminiService()

            # Only chunks that don't have an embedding yet.
            chunks = db.query(ZOPKKnowledgeChunk).filter(
                ZOPKKnowledgeChunk.embedding.is_(None)
            ).limit(limit).all()

            total = len(chunks)

            if total == 0:
                yield f"data: {json.dumps({'status': 'complete', 'message': 'Brak chunks bez embeddingów', 'total': 0}, ensure_ascii=False)}\n\n"
                return

            # Initial event so the client can size its progress bar.
            yield f"data: {json.dumps({'current': 0, 'total': total, 'percent': 0, 'stage': 'embedding', 'status': 'processing', 'message': f'Generuję embeddingi dla {total} chunks...'}, ensure_ascii=False)}\n\n"

            stats = {'success': 0, 'failed': 0}
            start_time = time.time()

            for idx, chunk in enumerate(chunks, 1):
                summary_short = chunk.summary[:40] if chunk.summary else f'chunk_{chunk.id}'

                # "processing" event before the (slow) API call.
                yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round((idx-1)/total*100, 1), 'stage': 'embedding', 'status': 'processing', 'message': f'Embedding {idx}/{total}: {summary_short}...', 'details': stats}, ensure_ascii=False)}\n\n"

                try:
                    embedding = gemini.generate_embedding(
                        text=chunk.content,
                        task_type='retrieval_document',
                        title=chunk.summary,
                        user_id=user_id,
                        feature='zopk_chunk_embedding'
                    )

                    if embedding:
                        chunk.embedding = json.dumps(embedding)
                        stats['success'] += 1
                        yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'success', 'message': f'✓ 768 dim: {summary_short}', 'details': stats}, ensure_ascii=False)}\n\n"
                    else:
                        stats['failed'] += 1
                        yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'failed', 'message': f'✗ Brak odpowiedzi API', 'details': stats}, ensure_ascii=False)}\n\n"

                except Exception as e:
                    stats['failed'] += 1
                    yield f"data: {json.dumps({'current': idx, 'total': total, 'percent': round(idx/total*100, 1), 'stage': 'embedding', 'status': 'failed', 'message': f'✗ {str(e)[:40]}', 'details': {'error': str(e), **stats}}, ensure_ascii=False)}\n\n"

                # Persist partial progress periodically (see COMMIT_EVERY).
                if idx % COMMIT_EVERY == 0:
                    db.commit()

            db.commit()
            processing_time = round(time.time() - start_time, 2)

            # Final "complete" event with summary counters.
            success_count = stats['success']
            complete_msg = f'Zakończono: {success_count}/{total} embeddingów'
            complete_data = {'current': total, 'total': total, 'percent': 100, 'stage': 'embedding', 'status': 'complete', 'message': complete_msg, 'details': {'processing_time': processing_time, **stats}}
            yield f"data: {json.dumps(complete_data, ensure_ascii=False)}\n\n"

        except Exception as e:
            logger.error(f"SSE embedding error: {e}")
            db.rollback()  # discard any uncommitted partial state
            yield f"data: {json.dumps({'status': 'error', 'message': str(e)}, ensure_ascii=False)}\n\n"
        finally:
            db.close()

    return Response(generate(), mimetype='text/event-stream', headers={
        'Cache-Control': 'no-cache',
        'X-Accel-Buffering': 'no'
    })
+
+
@app.route('/api/zopk/knowledge/search', methods=['POST'])
@login_required
def api_zopk_knowledge_search():
diff --git a/templates/admin/zopk_dashboard.html b/templates/admin/zopk_dashboard.html
index 38b01f4..72dd79b 100644
--- a/templates/admin/zopk_dashboard.html
+++ b/templates/admin/zopk_dashboard.html
@@ -1445,6 +1445,252 @@
+
+
+
+
+
+
+
+
+
+
+
+ Inicjalizacja...
+
+
+
+
+
+
+
+
+ ✓ Sukces:
+ 0
+
+
+ ✗ Błędy:
+ 0
+
+
+ ⊘ Pominięto:
+ 0
+
+
+ ⏱️ Czas:
+ 0s
+
+
+
+
+
+ Zamknij i odśwież
+
+
+
+
+
+
@@ -2544,18 +2790,193 @@ async function loadKnowledgeStats() {
}
}
-async function scrapeContent() {
- const btn = document.getElementById('scrapeBtn');
- const originalContent = btn.innerHTML;
+// ===========================================
+// AI Operations Modal Functions
+// ===========================================
+let aiOpsEventSource = null;
+let aiOpsLogEntries = 0;
+
// Open the AI-operations modal with a fully reset progress state.
function openAiOpsModal(title, icon) {
    const modal = document.getElementById('aiOpsModal');
    document.getElementById('aiOpsTitle').textContent = title;
    document.getElementById('aiOpsIcon').textContent = icon;
    document.getElementById('aiOpsIcon').classList.add('spinning');

    // Reset state
    document.getElementById('aiOpsProgressFill').style.width = '0%';
    document.getElementById('aiOpsPercent').textContent = '0%';
    document.getElementById('aiOpsCounter').textContent = '0 / 0';
    document.getElementById('aiOpsCurrentText').textContent = 'Inicjalizacja...';
    document.getElementById('aiOpsLog').innerHTML = '';
    document.getElementById('aiOpsSummary').style.display = 'none';
    document.getElementById('aiOpsActions').style.display = 'none';
    document.getElementById('aiOpsCloseBtn').style.display = 'none';
    document.getElementById('aiOpsCurrentOp').style.display = 'flex';
    // Bugfix: also reset summary fields completeAiOpsModal() may have set on a
    // previous run — otherwise a stale "skipped" row / timing value persists
    // when the next operation reports no such data.
    document.getElementById('aiOpsSummarySkippedRow').style.display = 'none';
    document.getElementById('aiOpsSummaryTime').textContent = '0s';
    aiOpsLogEntries = 0;
    document.getElementById('aiOpsLogCount').textContent = '0 wpisów';

    modal.classList.add('active');
}
+
// Hide the AI-operations modal and tear down any live SSE stream so a
// dismissed dialog never keeps a background connection open.
function closeAiOpsModal() {
    if (aiOpsEventSource) {
        aiOpsEventSource.close();
        aiOpsEventSource = null;
    }
    document.getElementById('aiOpsModal').classList.remove('active');
}
+
// Append one timestamped row to the modal's operation log and keep the log
// scrolled to the newest entry. `status` doubles as a CSS modifier class
// (log-entry success/failed/skipped/processing).
// NOTE(review): the innerHTML template below appears truncated in this view
// (wrapper markup around ${time}/${message} seems missing) — confirm against
// the original template file.
function addAiOpsLogEntry(status, message) {
    const log = document.getElementById('aiOpsLog');
    // Locale-formatted HH:MM:SS timestamp for the entry.
    const time = new Date().toLocaleTimeString('pl-PL', { hour: '2-digit', minute: '2-digit', second: '2-digit' });

    const entry = document.createElement('div');
    entry.className = `log-entry ${status}`;
    entry.innerHTML = `
    ${time}
    ${message}
    `;

    log.appendChild(entry);
    // Auto-scroll so the latest entry is always visible.
    log.scrollTop = log.scrollHeight;

    // Keep the visible entry counter in sync.
    aiOpsLogEntries++;
    document.getElementById('aiOpsLogCount').textContent = `${aiOpsLogEntries} wpisów`;
}
+
// Apply one SSE progress payload to the modal: progress bar, counter,
// current-operation text, and the scrolling log.
function updateAiOpsProgress(data) {
    const byId = (id) => document.getElementById(id);

    // Progress bar + percentage label.
    if (data.percent !== undefined) {
        byId('aiOpsProgressFill').style.width = `${data.percent}%`;
        byId('aiOpsPercent').textContent = `${Math.round(data.percent)}%`;
    }

    // "current / total" counter.
    if (data.current !== undefined && data.total !== undefined) {
        byId('aiOpsCounter').textContent = `${data.current} / ${data.total}`;
    }

    // Current-operation line.
    if (data.message) {
        byId('aiOpsCurrentText').textContent = data.message;
    }

    // Log row (needs both a status class and a message).
    if (data.status && data.message) {
        addAiOpsLogEntry(data.status, data.message);
    }
}
+
// Switch the modal from "running" to its final success/error summary view.
function completeAiOpsModal(data) {
    const isError = data.status === 'error';

    // Stop spinning and set the final icon/title.
    document.getElementById('aiOpsIcon').classList.remove('spinning');
    document.getElementById('aiOpsIcon').textContent = isError ? '❌' : '✅';
    document.getElementById('aiOpsTitle').textContent = isError ? 'Błąd operacji' : 'Operacja zakończona';

    // Hide current operation line.
    document.getElementById('aiOpsCurrentOp').style.display = 'none';

    // Populate summary counters. `??` (not `||`) so a legitimate count of 0
    // is displayed rather than falling through to the next field.
    const details = data.details || {};
    document.getElementById('aiOpsSummarySuccess').textContent = details.success ?? details.scraped ?? 0;
    document.getElementById('aiOpsSummaryFailed').textContent = details.failed ?? 0;

    if (details.skipped !== undefined) {
        document.getElementById('aiOpsSummarySkippedRow').style.display = 'flex';
        document.getElementById('aiOpsSummarySkipped').textContent = details.skipped;
    }

    // `!== undefined` so a very fast run reported as 0s still updates the field.
    if (details.processing_time !== undefined) {
        document.getElementById('aiOpsSummaryTime').textContent = `${details.processing_time}s`;
    }

    document.getElementById('aiOpsSummary').style.display = 'grid';
    document.getElementById('aiOpsActions').style.display = 'flex';
    document.getElementById('aiOpsCloseBtn').style.display = 'block';
}
+
// Open the progress modal and start streaming an SSE operation from
// `endpoint`, routing every event into the modal until a terminal
// 'complete'/'error' event (or a connection error) arrives.
function startSSEOperation(endpoint, title, icon, limit) {
    openAiOpsModal(title, icon);

    // Bugfix: close any previous stream first so repeated clicks never leak
    // a second concurrent EventSource connection.
    if (aiOpsEventSource) {
        aiOpsEventSource.close();
        aiOpsEventSource = null;
    }

    const url = `${endpoint}?limit=${encodeURIComponent(limit)}`;
    aiOpsEventSource = new EventSource(url);

    aiOpsEventSource.onmessage = function(event) {
        let data;
        try {
            data = JSON.parse(event.data);
        } catch (err) {
            // Ignore a malformed event rather than letting the exception
            // abort the handler for the rest of the stream.
            console.error('SSE parse error:', err, event.data);
            return;
        }

        if (data.status === 'complete' || data.status === 'error') {
            aiOpsEventSource.close();
            aiOpsEventSource = null;
            completeAiOpsModal(data);
        } else {
            updateAiOpsProgress(data);
        }
    };

    aiOpsEventSource.onerror = function(event) {
        console.error('SSE error:', event);
        aiOpsEventSource.close();
        aiOpsEventSource = null;
        completeAiOpsModal({ status: 'error', message: 'Błąd połączenia', details: {} });
    };
}
+
+// ===========================================
+// AI Knowledge Base Functions (with SSE)
+// ===========================================
+
// Ask for confirmation, then stream article-content scraping progress
// live over SSE (default batch of 30 articles).
async function scrapeContent() {
    const message =
        'Czy chcesz rozpocząć scrapowanie treści artykułów? ' +
        'Proces pobierze pełną treść z zatwierdzonych newsów które jeszcze nie mają treści. ' +
        'Postęp będzie wyświetlany na żywo. ';
    const options = { icon: '📄', title: 'Scraping treści', okText: 'Rozpocznij', okClass: 'btn-primary' };

    if (!(await showConfirm(message, options))) {
        return;
    }
    startSSEOperation('/admin/zopk/news/scrape-content/stream', 'Scraping treści artykułów', '📄', 30);
}
+
// Ask for confirmation, then stream AI knowledge-extraction progress
// live over SSE (default batch of 10 articles).
async function extractKnowledge() {
    const message =
        'Czy chcesz uruchomić ekstrakcję wiedzy przez AI? ' +
        'Gemini AI przeanalizuje zescrapowane artykuły i wyekstrahuje: ' +
        '• Chunks (fragmenty tekstu) ' +
        '• Fakty (daty, liczby, decyzje) ' +
        '• Encje (firmy, osoby, projekty) ' +
        'Postęp będzie wyświetlany na żywo. ';
    const options = { icon: '🤖', title: 'Ekstrakcja wiedzy', okText: 'Uruchom AI', okClass: 'btn-primary' };

    if (!(await showConfirm(message, options))) {
        return;
    }
    startSSEOperation('/admin/zopk/knowledge/extract/stream', 'Ekstrakcja wiedzy (Gemini AI)', '🤖', 10);
}
+
// Ask for confirmation, then stream embedding generation progress
// live over SSE (default batch of 50 chunks).
async function generateEmbeddings() {
    const message =
        'Czy chcesz wygenerować embeddingi dla semantic search? ' +
        'Google Text Embedding API przekształci tekst w wektory 768-wymiarowe. ' +
        'Embeddingi umożliwiają inteligentne wyszukiwanie w bazie wiedzy. ' +
        'Postęp będzie wyświetlany na żywo. ';
    const options = { icon: '🔍', title: 'Generowanie embeddingów', okText: 'Generuj', okClass: 'btn-primary' };

    if (!(await showConfirm(message, options))) {
        return;
    }
    startSSEOperation('/admin/zopk/knowledge/embeddings/stream', 'Generowanie embeddingów', '🔍', 50);
}
+
+// Keep old code for backward compatibility (non-SSE version - can be removed later)
+async function scrapeContentOld() {
+ const btn = document.getElementById('scrapeBtn');
+ const originalContent = btn.innerHTML;
+
btn.disabled = true;
btn.innerHTML = ' ';
@@ -2581,21 +3002,10 @@ async function scrapeContent() {
}
}
-async function extractKnowledge() {
+async function extractKnowledgeOld() {
const btn = document.getElementById('extractBtn');
const originalContent = btn.innerHTML;
- const confirmed = await showConfirm(
- 'Czy chcesz uruchomić ekstrakcję wiedzy przez AI? ' +
- 'Gemini AI przeanalizuje zescrapowane artykuły i wyekstrahuje: ' +
- '• Chunks (fragmenty tekstu) ' +
- '• Fakty (daty, liczby, decyzje) ' +
- '• Encje (firmy, osoby, projekty) ',
- { icon: '🤖', title: 'Ekstrakcja wiedzy', okText: 'Uruchom AI', okClass: 'btn-primary' }
- );
-
- if (!confirmed) return;
-
btn.disabled = true;
btn.innerHTML = ' ';
@@ -2621,19 +3031,10 @@ async function extractKnowledge() {
}
}
-async function generateEmbeddings() {
+async function generateEmbeddingsOld() {
const btn = document.getElementById('embeddingsBtn');
const originalContent = btn.innerHTML;
- const confirmed = await showConfirm(
- 'Czy chcesz wygenerować embeddingi dla semantic search? ' +
- 'Google Text Embedding API przekształci tekst w wektory 768-wymiarowe. ' +
- 'Embeddingi umożliwiają inteligentne wyszukiwanie w bazie wiedzy. ',
- { icon: '🔍', title: 'Generowanie embeddingów', okText: 'Generuj', okClass: 'btn-primary' }
- );
-
- if (!confirmed) return;
-
btn.disabled = true;
btn.innerHTML = ' ';
diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py
index a380331..cc1a502 100644
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@@ -19,9 +19,9 @@ import logging
import hashlib
import base64
from datetime import datetime
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple, Callable, Any
from urllib.parse import urlparse, parse_qs, unquote
-from dataclasses import dataclass
+from dataclasses import dataclass, field
import requests
from bs4 import BeautifulSoup, Comment, NavigableString
@@ -312,6 +312,24 @@ class ScrapeResult:
status: str = 'pending' # scraped, failed, skipped
@dataclass
class ProgressUpdate:
    """Progress update for batch operations.

    Emitted by the batch scrape/extract/embedding loops through an optional
    progress callback; SSE endpoints serialize instances with
    dataclasses.asdict() and stream them to the admin dashboard.
    """
    current: int  # index of the item being processed (0 = not started yet)
    total: int  # total number of items in this batch
    percent: float  # completion percentage, 0.0-100.0
    stage: str  # 'scraping', 'extracting', 'embedding'
    status: str  # 'processing', 'success', 'failed', 'complete'
    message: str  # human-readable progress message (Polish, shown in the UI)
    details: Dict[str, Any] = field(default_factory=dict)  # stage-specific counters/extras
    article_id: Optional[int] = None  # related article id, when applicable
    article_title: Optional[str] = None  # truncated article title, when applicable


# Type alias for progress callback: a callable invoked with each
# ProgressUpdate, or None when no progress reporting is wanted.
ProgressCallback = Optional[Callable[[ProgressUpdate], None]]
+
+
# ============================================================
# SCRAPER CLASS
# ============================================================
@@ -704,7 +722,8 @@ class ZOPKContentScraper:
self,
limit: int = 50,
status_filter: Optional[str] = None,
- force: bool = False
+ force: bool = False,
+ progress_callback: ProgressCallback = None
) -> Dict:
"""
Batch scrape articles.
@@ -713,6 +732,7 @@ class ZOPKContentScraper:
limit: Maximum number of articles to scrape
status_filter: Filter by approval status (approved, auto_approved)
force: If True, re-scrape even already scraped articles
+ progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
@@ -743,10 +763,11 @@ class ZOPKContentScraper:
# Limit
articles = query.limit(limit).all()
+ total = len(articles)
# Statistics
stats = {
- 'total': len(articles),
+ 'total': total,
'scraped': 0,
'failed': 0,
'skipped': 0,
@@ -755,9 +776,40 @@ class ZOPKContentScraper:
'processing_time': 0
}
+ # Send initial progress
+ if progress_callback and total > 0:
+ progress_callback(ProgressUpdate(
+ current=0,
+ total=total,
+ percent=0.0,
+ stage='scraping',
+ status='processing',
+ message=f'Rozpoczynam scraping {total} artykułów...',
+ details={'scraped': 0, 'failed': 0, 'skipped': 0}
+ ))
+
start_time = time.time()
- for article in articles:
+ for idx, article in enumerate(articles, 1):
+ # Send progress update before processing
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round((idx - 1) / total * 100, 1),
+ stage='scraping',
+ status='processing',
+ message=f'Pobieram treść: {article.title[:50]}...',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'scraped': stats['scraped'],
+ 'failed': stats['failed'],
+ 'skipped': stats['skipped'],
+ 'source': article.source_name or 'nieznane'
+ }
+ ))
+
result = self.scrape_article(article.id)
if result.status == 'scraped':
@@ -768,8 +820,37 @@ class ZOPKContentScraper:
'word_count': result.word_count,
'source': article.source_name
})
+ # Send success progress
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='scraping',
+ status='success',
+ message=f'✓ Pobrano {result.word_count} słów: {article.title[:40]}...',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'scraped': stats['scraped'],
+ 'failed': stats['failed'],
+ 'skipped': stats['skipped'],
+ 'word_count': result.word_count
+ }
+ ))
elif result.status == 'skipped':
stats['skipped'] += 1
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='scraping',
+ status='skipped',
+ message=f'⊘ Pominięto: {article.title[:40]}...',
+ article_id=article.id,
+ details={'scraped': stats['scraped'], 'failed': stats['failed'], 'skipped': stats['skipped']}
+ ))
else:
stats['failed'] += 1
stats['errors'].append({
@@ -777,9 +858,43 @@ class ZOPKContentScraper:
'url': article.url,
'error': result.error
})
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='scraping',
+ status='failed',
+ message=f'✗ Błąd: {result.error[:50]}...' if result.error else '✗ Błąd',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'scraped': stats['scraped'],
+ 'failed': stats['failed'],
+ 'skipped': stats['skipped'],
+ 'error': result.error
+ }
+ ))
stats['processing_time'] = round(time.time() - start_time, 2)
+ # Send completion progress
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=total,
+ total=total,
+ percent=100.0,
+ stage='scraping',
+ status='complete',
+ message=f'Zakończono: {stats["scraped"]} pobrano, {stats["failed"]} błędów, {stats["skipped"]} pominięto',
+ details={
+ 'scraped': stats['scraped'],
+ 'failed': stats['failed'],
+ 'skipped': stats['skipped'],
+ 'processing_time': stats['processing_time']
+ }
+ ))
+
logger.info(
f"Batch scrape complete: {stats['scraped']} scraped, "
f"{stats['failed']} failed, {stats['skipped']} skipped "
diff --git a/zopk_knowledge_service.py b/zopk_knowledge_service.py
index 5e1dad4..93a7cba 100644
--- a/zopk_knowledge_service.py
+++ b/zopk_knowledge_service.py
@@ -22,9 +22,12 @@ import json
import logging
import hashlib
from datetime import datetime
-from typing import Dict, List, Optional, Tuple, Any
+from typing import Dict, List, Optional, Tuple, Any, Callable
from dataclasses import dataclass, field
+# Import progress tracking from scraper
+from zopk_content_scraper import ProgressUpdate, ProgressCallback
+
from database import (
ZOPKNews,
ZOPKKnowledgeChunk,
@@ -663,12 +666,13 @@ class ZOPKKnowledgeService:
processing_time=processing_time
)
- def batch_extract(self, limit: int = 50) -> Dict:
+ def batch_extract(self, limit: int = 50, progress_callback: ProgressCallback = None) -> Dict:
"""
Batch extract knowledge from scraped articles.
Args:
limit: Maximum number of articles to process
+ progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
@@ -686,8 +690,9 @@ class ZOPKKnowledgeService:
ZOPKNews.created_at.desc()
).limit(limit).all()
+ total = len(articles)
stats = {
- 'total': len(articles),
+ 'total': total,
'success': 0,
'failed': 0,
'chunks_created': 0,
@@ -698,9 +703,41 @@ class ZOPKKnowledgeService:
'processing_time': 0
}
+ # Send initial progress
+ if progress_callback and total > 0:
+ progress_callback(ProgressUpdate(
+ current=0,
+ total=total,
+ percent=0.0,
+ stage='extracting',
+ status='processing',
+ message=f'Rozpoczynam ekstrakcję wiedzy z {total} artykułów...',
+ details={'success': 0, 'failed': 0, 'chunks': 0, 'facts': 0, 'entities': 0}
+ ))
+
start_time = time.time()
- for article in articles:
+ for idx, article in enumerate(articles, 1):
+ # Send progress update before processing
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round((idx - 1) / total * 100, 1),
+ stage='extracting',
+ status='processing',
+ message=f'Analizuję przez AI: {article.title[:50]}...',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'chunks': stats['chunks_created'],
+ 'facts': stats['facts_created'],
+ 'entities': stats['entities_created']
+ }
+ ))
+
result = self.extract_from_news(article.id)
if result.success:
@@ -709,6 +746,28 @@ class ZOPKKnowledgeService:
stats['facts_created'] += result.facts_created
stats['entities_created'] += result.entities_created
stats['relations_created'] += result.relations_created
+
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='extracting',
+ status='success',
+ message=f'✓ Wyekstrahowano: {result.chunks_created} chunks, {result.facts_created} faktów, {result.entities_created} encji',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'chunks': stats['chunks_created'],
+ 'facts': stats['facts_created'],
+ 'entities': stats['entities_created'],
+ 'new_chunks': result.chunks_created,
+ 'new_facts': result.facts_created,
+ 'new_entities': result.entities_created
+ }
+ ))
else:
stats['failed'] += 1
if result.error:
@@ -718,8 +777,47 @@ class ZOPKKnowledgeService:
'error': result.error
})
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='extracting',
+ status='failed',
+ message=f'✗ Błąd ekstrakcji: {result.error[:50]}...' if result.error else '✗ Błąd',
+ article_id=article.id,
+ article_title=article.title[:80],
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'error': result.error
+ }
+ ))
+
stats['processing_time'] = round(time.time() - start_time, 2)
+ # Send completion progress
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=total,
+ total=total,
+ percent=100.0,
+ stage='extracting',
+ status='complete',
+ message=f'Zakończono: {stats["success"]}/{total} artykułów. '
+ f'Utworzono: {stats["chunks_created"]} chunks, {stats["facts_created"]} faktów, '
+ f'{stats["entities_created"]} encji',
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'chunks': stats['chunks_created'],
+ 'facts': stats['facts_created'],
+ 'entities': stats['entities_created'],
+ 'relations': stats['relations_created'],
+ 'processing_time': stats['processing_time']
+ }
+ ))
+
logger.info(
f"Batch extraction complete: {stats['success']}/{stats['total']} success "
f"in {stats['processing_time']}s"
@@ -950,7 +1048,12 @@ def get_relevant_facts(
return results[:limit]
-def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[int] = None) -> Dict:
+def generate_chunk_embeddings(
+ db_session,
+ limit: int = 100,
+ user_id: Optional[int] = None,
+ progress_callback: ProgressCallback = None
+) -> Dict:
"""
Generate embeddings for chunks that don't have them.
@@ -958,11 +1061,13 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
db_session: SQLAlchemy session
limit: Max chunks to process
user_id: User ID for cost tracking
+ progress_callback: Optional callback for progress updates
Returns:
Dict with statistics
"""
import json
+ import time
from gemini_service import GeminiService
gemini = GeminiService()
@@ -972,13 +1077,52 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
ZOPKKnowledgeChunk.embedding.is_(None)
).limit(limit).all()
+ total = len(chunks)
stats = {
- 'total': len(chunks),
+ 'total': total,
'success': 0,
- 'failed': 0
+ 'failed': 0,
+ 'processing_time': 0
}
- for chunk in chunks:
+ # Send initial progress
+ if progress_callback and total > 0:
+ progress_callback(ProgressUpdate(
+ current=0,
+ total=total,
+ percent=0.0,
+ stage='embedding',
+ status='processing',
+ message=f'Rozpoczynam generowanie embeddingów dla {total} chunks...',
+ details={'success': 0, 'failed': 0}
+ ))
+
+ start_time = time.time()
+
+ for idx, chunk in enumerate(chunks, 1):
+ # Send progress update before processing
+ if progress_callback:
+ # Get article title from chunk's source news
+ article_title = None
+ if chunk.source_news:
+ article_title = chunk.source_news.title[:80]
+
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round((idx - 1) / total * 100, 1),
+ stage='embedding',
+ status='processing',
+ message=f'Generuję embedding {idx}/{total}: {chunk.summary[:40] if chunk.summary else "chunk"}...',
+ article_id=chunk.source_news_id,
+ article_title=article_title,
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'chunk_id': chunk.id
+ }
+ ))
+
try:
embedding = gemini.generate_embedding(
text=chunk.content,
@@ -992,14 +1136,70 @@ def generate_chunk_embeddings(db_session, limit: int = 100, user_id: Optional[in
# Store as JSON string
chunk.embedding = json.dumps(embedding)
stats['success'] += 1
+
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='embedding',
+ status='success',
+ message=f'✓ Wygenerowano embedding (768 dim)',
+ article_id=chunk.source_news_id,
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'chunk_id': chunk.id
+ }
+ ))
else:
stats['failed'] += 1
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='embedding',
+ status='failed',
+ message='✗ Nie udało się wygenerować embeddingu',
+ article_id=chunk.source_news_id,
+ details={'success': stats['success'], 'failed': stats['failed']}
+ ))
except Exception as e:
logger.error(f"Error generating embedding for chunk {chunk.id}: {e}")
stats['failed'] += 1
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=idx,
+ total=total,
+ percent=round(idx / total * 100, 1),
+ stage='embedding',
+ status='failed',
+ message=f'✗ Błąd: {str(e)[:50]}...',
+ article_id=chunk.source_news_id,
+ details={'success': stats['success'], 'failed': stats['failed'], 'error': str(e)}
+ ))
+
db_session.commit()
+ stats['processing_time'] = round(time.time() - start_time, 2)
+
+ # Send completion progress
+ if progress_callback:
+ progress_callback(ProgressUpdate(
+ current=total,
+ total=total,
+ percent=100.0,
+ stage='embedding',
+ status='complete',
+ message=f'Zakończono: {stats["success"]}/{total} embeddingów wygenerowanych',
+ details={
+ 'success': stats['success'],
+ 'failed': stats['failed'],
+ 'processing_time': stats['processing_time']
+ }
+ ))
logger.info(f"Generated embeddings: {stats['success']}/{stats['total']} success")