feat: add website discovery service for companies without websites
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Automated discovery using Brave Search API to find company websites,
scrape verification data (NIP/REGON/KRS/email/phone), and present
candidates with match badges in the data quality dashboard.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 08:27:13 +01:00
parent 01bc40132e
commit 126eff8af6
7 changed files with 1021 additions and 1 deletion

View File

@ -31,3 +31,4 @@ from . import routes_competitors # noqa: E402, F401
from . import routes_social_publisher # noqa: E402, F401
from . import routes_data_quality # noqa: E402, F401
from . import routes_bulk_enrichment # noqa: E402, F401
from . import routes_website_discovery # noqa: E402, F401

View File

@ -16,7 +16,8 @@ from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole
CompanySocialMedia, GBPAudit, SystemRole,
WebsiteDiscoveryCandidate
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
@ -212,6 +213,48 @@ def admin_data_quality():
'google_name': g_name,
})
# Website discovery candidates (pending)
discovery_candidates = db.query(WebsiteDiscoveryCandidate).filter(
WebsiteDiscoveryCandidate.status == 'pending',
WebsiteDiscoveryCandidate.candidate_url != 'none',
).order_by(WebsiteDiscoveryCandidate.match_score.desc()).all()
# Enrich with company name
discovery_data = []
for dc in discovery_candidates:
comp = company_map.get(dc.company_id)
if not comp:
continue
discovery_data.append({
'id': dc.id,
'company_id': dc.company_id,
'company_name': comp.name,
'company_slug': comp.slug,
'url': dc.candidate_url,
'domain': dc.candidate_domain or '',
'title': dc.brave_title or '',
'description': (dc.brave_description or '')[:100],
'match_nip': dc.match_nip,
'match_regon': dc.match_regon,
'match_krs': dc.match_krs,
'match_phone': dc.match_phone,
'match_email': dc.match_email,
'match_city': dc.match_city,
'match_owner': dc.match_owner,
'confidence': dc.confidence,
'score': dc.match_score,
'has_nip': bool(comp.nip),
'has_regon': bool(comp.regon),
'has_krs': bool(comp.krs),
'has_phone': bool(comp.phone),
'has_email': bool(comp.email),
'has_city': bool(comp.address_city),
'has_owner': bool(getattr(comp, 'owner_name', None)),
})
# Count companies without website
companies_without_website = sum(1 for c in companies_table if not c['website'])
return render_template(
'admin/data_quality_dashboard.html',
total=total,
@ -221,6 +264,8 @@ def admin_data_quality():
avg_score=avg_score,
companies_table=companies_table,
available_data=available_data,
discovery_data=discovery_data,
companies_without_website=companies_without_website,
now=now,
)
finally:

View File

@ -0,0 +1,167 @@
"""
Admin Website Discovery Routes
================================
Endpoints for discovering and managing website candidates for companies.
"""
import logging
import threading
import time
import uuid
from datetime import datetime

from flask import request, jsonify
from flask_login import login_required

from . import bp
from database import SessionLocal, Company, WebsiteDiscoveryCandidate, SystemRole
from services.website_discovery_service import WebsiteDiscoveryService
from utils.data_quality import update_company_data_quality
from utils.decorators import role_required
logger = logging.getLogger(__name__)
# Store bulk job progress
_bulk_jobs = {}
@bp.route('/discover-website/<int:company_id>', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_website(company_id):
    """Discover a website candidate for a single company.

    Runs a synchronous Brave search + scrape via WebsiteDiscoveryService.

    Returns:
        200 {'success': True, ...result} when discovery produced a result,
        200 {'success': False, 'error': ...} for a discovery-level error,
        404 when the company does not exist,
        500 on an unexpected failure.
    """
    db = SessionLocal()
    try:
        company = db.query(Company).get(company_id)
        if not company:
            return jsonify({'error': 'Firma nie znaleziona'}), 404
        service = WebsiteDiscoveryService(db=db)
        result = service.discover_for_company(company)
        if result.get('error'):
            return jsonify({'success': False, 'error': result['error']})
        return jsonify({'success': True, **result})
    except Exception as e:
        # logger.exception records the full traceback; lazy %-args avoid
        # building the message when the level is disabled.
        logger.exception("Discovery error for company %s", company_id)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
@bp.route('/discover-websites-bulk', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_bulk():
    """Start bulk website discovery in a background thread.

    Registers the job in ``_bulk_jobs`` *before* the worker thread starts:
    previously the entry was created inside the thread, so a status poll
    arriving before the thread was scheduled got a spurious 404.

    Returns {'success': True, 'job_id': ...}; progress is polled via
    /discover-websites-status.
    """
    job_id = str(uuid.uuid4())[:8]
    # Register synchronously so the first client poll always finds the job.
    _bulk_jobs[job_id] = {'status': 'running', 'processed': 0, 'total': 0, 'latest_result': ''}

    def run_bulk(job_id):
        db = SessionLocal()
        try:
            # Up to 50 active/pending companies whose website field is empty.
            companies = db.query(Company).filter(
                Company.status.in_(['active', 'pending']),
                (Company.website == None) | (Company.website == ''),  # noqa: E711
            ).order_by(Company.name).limit(50).all()
            _bulk_jobs[job_id]['total'] = len(companies)
            service = WebsiteDiscoveryService(db=db)
            for company in companies:
                result = service.discover_for_company(company)
                _bulk_jobs[job_id]['processed'] += 1
                status_text = f"{company.name}: "
                if result.get('status') == 'found':
                    status_text += f"znaleziono {result.get('url', '?')} ({result.get('confidence', '?')})"
                elif result.get('status') == 'exists':
                    status_text += "kandydat już istnieje"
                else:
                    status_text += result.get('error', 'błąd')
                _bulk_jobs[job_id]['latest_result'] = status_text
                # Rate-limit between companies (Brave API quota + polite scraping).
                if _bulk_jobs[job_id]['processed'] < _bulk_jobs[job_id]['total']:
                    time.sleep(2)
            _bulk_jobs[job_id]['status'] = 'completed'
        except Exception as e:
            logger.exception("Bulk discovery error")
            _bulk_jobs[job_id]['status'] = 'error'
            _bulk_jobs[job_id]['latest_result'] = str(e)
        finally:
            db.close()

    thread = threading.Thread(target=run_bulk, args=(job_id,), daemon=True)
    thread.start()
    return jsonify({'success': True, 'job_id': job_id})
@bp.route('/discover-websites-status')
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_status():
    """Return the progress dict for a running bulk-discovery job (404 if unknown)."""
    job_id = request.args.get('job_id')
    job = _bulk_jobs.get(job_id) if job_id else None
    if job is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify(job)
@bp.route('/discovery/<int:candidate_id>/accept', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def accept_discovery(candidate_id):
    """Accept a discovery candidate - set company.website."""
    session = SessionLocal()
    try:
        cand = session.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if cand is None:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404
        firm = session.query(Company).get(cand.company_id)
        if firm is None:
            return jsonify({'error': 'Firma nie znaleziona'}), 404
        # Promote the candidate URL onto the company record and mark it reviewed.
        firm.website = cand.candidate_url
        cand.status = 'accepted'
        cand.reviewed_at = datetime.now()
        # Recompute completeness now that the website field is filled.
        update_company_data_quality(firm, session)
        session.commit()
        logger.info(f"Accepted website {cand.candidate_url} for company {firm.name}")
        return jsonify({'success': True, 'url': cand.candidate_url})
    except Exception as e:
        session.rollback()
        logger.error(f"Accept error: {e}")
        return jsonify({'error': str(e)}), 500
    finally:
        session.close()
@bp.route('/discovery/<int:candidate_id>/reject', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def reject_discovery(candidate_id):
    """Reject a discovery candidate.

    Marks the candidate 'rejected' so it disappears from the pending list
    on the data-quality dashboard.
    """
    db = SessionLocal()
    try:
        candidate = db.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if not candidate:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404
        candidate.status = 'rejected'
        candidate.reviewed_at = datetime.now()
        db.commit()
        return jsonify({'success': True})
    except Exception as e:
        db.rollback()
        # Previously failures here were silent; log with traceback like the
        # accept endpoint does.
        logger.exception("Reject error for candidate %s", candidate_id)
        return jsonify({'error': str(e)}), 500
    finally:
        db.close()

View File

@ -5463,6 +5463,47 @@ class SocialMediaConfig(Base):
return f'<SocialMediaConfig {self.platform} company_id={self.company_id} page={self.page_name}>'
class WebsiteDiscoveryCandidate(Base):
    """Website candidates found via Brave Search for companies missing website field.

    One row per (company, URL) pair, enforced by ``uq_wdc_company_url``.
    Rows start as status 'pending' and move to 'accepted'/'rejected' via the
    admin data-quality dashboard; 'error' rows (candidate_url 'none') record
    searches that returned no results.
    """
    __tablename__ = 'website_discovery_candidates'
    id = Column(Integer, primary_key=True)
    company_id = Column(Integer, ForeignKey('companies.id'), nullable=False)
    discovered_at = Column(DateTime, default=datetime.now)
    # What was searched and what Brave returned for the chosen result.
    search_query = Column(Text)
    candidate_url = Column(String(500), nullable=False)  # 'none' when search had no results
    candidate_domain = Column(String(255))
    brave_title = Column(Text)
    brave_description = Column(Text)
    # Identifiers scraped from the candidate page (NULL when nothing found).
    extracted_nips = Column(PG_ARRAY(Text))
    extracted_regons = Column(PG_ARRAY(Text))
    extracted_krs = Column(PG_ARRAY(Text))
    extracted_phones = Column(PG_ARRAY(Text))
    extracted_emails = Column(PG_ARRAY(Text))
    page_text_snippet = Column(Text)
    # Per-signal comparison flags against the company record.
    match_nip = Column(Boolean, default=False)
    match_regon = Column(Boolean, default=False)
    match_krs = Column(Boolean, default=False)
    match_phone = Column(Boolean, default=False)
    match_email = Column(Boolean, default=False)
    match_city = Column(Boolean, default=False)
    match_owner = Column(Boolean, default=False)
    # Aggregate verdict: confidence in {'low','medium','high'}, weighted score.
    confidence = Column(String(10), default='low')
    match_score = Column(Integer, default=0)
    # Review lifecycle: 'pending' | 'accepted' | 'rejected' | 'error'.
    status = Column(String(20), default='pending')
    reviewed_at = Column(DateTime)
    error_message = Column(Text)
    company = relationship('Company', foreign_keys=[company_id])
    __table_args__ = (
        UniqueConstraint('company_id', 'candidate_url', name='uq_wdc_company_url'),
    )

    def __repr__(self):
        return f'<WebsiteDiscoveryCandidate {self.id} company={self.company_id} confidence={self.confidence}>'
# ============================================================
# DATABASE INITIALIZATION
# ============================================================

View File

@ -0,0 +1,38 @@
-- Website Discovery Candidates
-- Stores website candidates found via Brave Search for companies missing website field.
-- Mirrors the SQLAlchemy model WebsiteDiscoveryCandidate.
CREATE TABLE IF NOT EXISTS website_discovery_candidates (
    id SERIAL PRIMARY KEY,
    company_id INTEGER NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    discovered_at TIMESTAMP DEFAULT NOW(),
    -- Search context: query sent to Brave and the chosen result.
    search_query TEXT,
    candidate_url VARCHAR(500) NOT NULL, -- 'none' sentinel when search had no results
    candidate_domain VARCHAR(255),
    brave_title TEXT,
    brave_description TEXT,
    -- Identifiers scraped from the candidate page.
    extracted_nips TEXT[],
    extracted_regons TEXT[],
    extracted_krs TEXT[],
    extracted_phones TEXT[],
    extracted_emails TEXT[],
    page_text_snippet TEXT,
    -- Per-signal comparison flags against the company record.
    match_nip BOOLEAN DEFAULT FALSE,
    match_regon BOOLEAN DEFAULT FALSE,
    match_krs BOOLEAN DEFAULT FALSE,
    match_phone BOOLEAN DEFAULT FALSE,
    match_email BOOLEAN DEFAULT FALSE,
    match_city BOOLEAN DEFAULT FALSE,
    match_owner BOOLEAN DEFAULT FALSE,
    -- Aggregate verdict and review lifecycle.
    confidence VARCHAR(10) DEFAULT 'low',      -- 'low' | 'medium' | 'high'
    match_score INTEGER DEFAULT 0,
    status VARCHAR(20) DEFAULT 'pending',      -- 'pending' | 'accepted' | 'rejected' | 'error'
    reviewed_at TIMESTAMP,
    error_message TEXT,
    -- One candidate row per (company, URL) pair.
    UNIQUE(company_id, candidate_url)
);
-- Dashboard queries filter by status and join on company.
CREATE INDEX IF NOT EXISTS idx_wdc_status ON website_discovery_candidates(status);
CREATE INDEX IF NOT EXISTS idx_wdc_company ON website_discovery_candidates(company_id);
GRANT ALL ON TABLE website_discovery_candidates TO nordabiz_app;
GRANT USAGE, SELECT ON SEQUENCE website_discovery_candidates_id_seq TO nordabiz_app;

View File

@ -0,0 +1,481 @@
"""
Website Discovery Service
==========================
Discovers websites for companies that don't have one registered.
Uses Brave Web Search API to find candidates, scrapes them for verification data,
and compares extracted information against known company data.
"""
import os
import re
import time
import logging
from urllib.parse import urlparse
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from database import SessionLocal, Company, WebsiteDiscoveryCandidate
from utils.data_quality import update_company_data_quality
logger = logging.getLogger(__name__)
# Domains to skip - business directories, social media, own portal.
# We want the company's *own* site, not its listing on an aggregator;
# _is_directory_domain also skips any subdomain of these.
DIRECTORY_DOMAINS = {
    'panoramafirm.pl', 'aleo.com', 'rejestr.io', 'krs-pobierz.pl',
    'gowork.pl', 'oferteo.pl', 'pkt.pl', 'firmy.net', 'zumi.pl',
    'baza-firm.com.pl', 'e-krs.pl', 'krs-online.com.pl', 'regon.info',
    'infoveriti.pl', 'companywall.pl', 'findcompany.pl', 'owg.pl',
    'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
    'twitter.com', 'x.com', 'tiktok.com',
    'nordabiznes.pl', 'google.com', 'google.pl',
    'wikipedia.org', 'olx.pl', 'allegro.pl',
}
# --- Extraction helpers ---
def _normalize_nip(nip):
return re.sub(r'[^0-9]', '', nip)
def _validate_nip(nip):
    """Validate a Polish NIP via its weighted mod-11 check digit.

    Non-digit characters are stripped first, so formatted input
    ('526-025-02-74') validates the same as a bare 10-digit string.
    """
    digits = ''.join(ch for ch in nip if ch in '0123456789')
    if len(digits) != 10:
        return False
    try:
        weighted = zip((6, 5, 7, 2, 3, 4, 5, 6, 7), digits)
        control = sum(w * int(d) for w, d in weighted) % 11
        # A control of 10 can never equal a single digit -> invalid.
        return control == int(digits[9])
    except (ValueError, IndexError):
        return False
def _validate_regon(regon):
regon = re.sub(r'[^0-9]', '', regon)
if len(regon) == 9:
weights = [8, 9, 2, 3, 4, 5, 6, 7]
checksum = sum(int(regon[i]) * weights[i] for i in range(8)) % 11
if checksum == 10:
checksum = 0
return checksum == int(regon[8])
elif len(regon) == 14:
weights9 = [8, 9, 2, 3, 4, 5, 6, 7]
checksum9 = sum(int(regon[i]) * weights9[i] for i in range(8)) % 11
if checksum9 == 10:
checksum9 = 0
if checksum9 != int(regon[8]):
return False
weights14 = [2, 4, 8, 5, 0, 9, 7, 3, 6, 1, 2, 4, 8]
checksum14 = sum(int(regon[i]) * weights14[i] for i in range(13)) % 11
if checksum14 == 10:
checksum14 = 0
return checksum14 == int(regon[13])
return False
def _find_nips_in_text(text):
    """Extract validated, de-duplicated NIP numbers from free text (first-seen order)."""
    patterns = (
        r'NIP[:\s]*(\d{3}[-\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2})',
        r'NIP[:\s]*(\d{10})',
        r'numer\s+identyfikacji\s+podatkowej[:\s]*(\d{10})',
    )
    found = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = _normalize_nip(raw)
            if _validate_nip(candidate) and candidate not in found:
                found.append(candidate)
    return found
def _find_regons_in_text(text):
    """Extract validated, de-duplicated REGON numbers from free text (first-seen order)."""
    patterns = (
        r'REGON[:\s]*(\d{9,14})',
        r'rejestr\s+gospodarczy[:\s]*(\d{9,14})',
    )
    found = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = re.sub(r'[^0-9]', '', raw)
            if _validate_regon(candidate) and candidate not in found:
                found.append(candidate)
    return found
def _find_krs_in_text(text):
patterns = [
r'KRS[:\s]*(\d{10})',
r'Krajow\w+\s+Rejestr\w*\s+S[aą]dow\w*[:\s]*(\d{10})',
]
krs_numbers = []
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for match in matches:
krs = re.sub(r'[^0-9]', '', match)
if len(krs) == 10 and krs not in krs_numbers:
krs_numbers.append(krs)
return krs_numbers
def _extract_emails(text):
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, text)
skip_domains = {
'example.com', 'sentry.io', 'wixpress.com', 'wordpress.org',
'w3.org', 'schema.org', 'googleapis.com',
}
return list(dict.fromkeys(
e.lower() for e in emails
if not any(d in e.lower() for d in skip_domains)
))[:5]
def _extract_phones(text):
phone_patterns = [
r'(?:\+48\s?)?\d{2}[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}',
r'(?:\+48\s?)?\d{3}[\s-]?\d{3}[\s-]?\d{3}',
r'\(\d{2}\)\s?\d{3}[\s-]?\d{2}[\s-]?\d{2}',
r'(?:tel|phone|telefon)[.:]\s*[\+]?\d[\d\s\-]{7,14}',
]
phones = []
for pattern in phone_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for m in matches:
clean = re.sub(r'(?:tel|phone|telefon)[.:]?\s*', '', m, flags=re.IGNORECASE).strip()
digits = re.sub(r'\D', '', clean)
if 9 <= len(digits) <= 12:
phones.append(clean)
return list(dict.fromkeys(phones))[:5]
def _is_directory_domain(url):
    """Return True when *url*'s host is (or is a subdomain of) a known directory/social site."""
    try:
        host = urlparse(url).netloc.lower()
        if host.startswith('www.'):
            host = host[4:]
        for known in DIRECTORY_DOMAINS:
            if host == known or host.endswith('.' + known):
                return True
        return False
    except Exception:
        # Malformed URLs are treated as non-directory; caller decides what to do.
        return False
def _fetch_page_text(url, timeout=15):
    """Fetch *url* and return its visible text content, or None on failure.

    Strips script/style/navigation chrome before extracting text so scraped
    identifiers come from body content. On an SSL error the fetch is retried
    once over plain HTTP (small-business sites often have broken certs).

    Refactored: the fetch+parse path is shared between the HTTPS attempt and
    the HTTP retry (previously duplicated), and the retry now applies the
    same content-type guard as the first attempt.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; NordaBizBot/1.0)',
        'Accept': 'text/html,application/xhtml+xml',
    }

    def _get_text(target):
        # Shared fetch + HTML-to-text pipeline.
        resp = requests.get(target, headers=headers, timeout=timeout, allow_redirects=True)
        content_type = resp.headers.get('Content-Type', '')
        if 'text/html' not in content_type and 'application/xhtml' not in content_type:
            return None
        resp.encoding = resp.apparent_encoding or 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Remove non-content elements (scripts, chrome) before text extraction.
        for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'noscript']):
            tag.decompose()
        return soup.get_text(separator=' ', strip=True)

    try:
        return _get_text(url)
    except requests.exceptions.SSLError:
        # Retry once over HTTP for sites with broken certificates.
        try:
            return _get_text(url.replace('https://', 'http://'))
        except Exception:
            return None
    except Exception as e:
        logger.warning(f"Failed to fetch {url}: {e}")
        return None
class WebsiteDiscoveryService:
    """Discovers and validates website candidates for companies.

    Pipeline per company: Brave web search -> directory-domain filtering ->
    scrape the best candidate -> extract NIP/REGON/KRS/phone/email ->
    compare against the company record -> persist a WebsiteDiscoveryCandidate
    row for admin review.
    """

    def __init__(self, db=None):
        # db: optional externally-owned SQLAlchemy session. When omitted,
        # discover_for_company opens and closes its own session per call.
        self.db = db
        self.brave_api_key = os.getenv('BRAVE_API_KEY')

    def discover_for_company(self, company):
        """
        Search for website, scrape, compare, save candidate.
        Returns dict with result info: {'status': 'found'|'exists', ...}
        on success, or {'error': ...} on failure / no results.
        """
        if not self.brave_api_key:
            return {'error': 'BRAVE_API_KEY not configured'}
        db = self.db or SessionLocal()
        own_session = self.db is None
        try:
            # Build search query: quoted company name plus city (if known).
            city = company.address_city or ''
            query = f'"{company.name}" {city}'.strip()
            # Search Brave (directory/social domains already filtered out).
            urls = self._search_brave(query)
            if not urls:
                # Save error candidate with the 'none' URL sentinel so the
                # company is not retried endlessly; dashboard hides these.
                candidate = WebsiteDiscoveryCandidate(
                    company_id=company.id,
                    search_query=query,
                    candidate_url='none',
                    status='error',
                    error_message='Brak wyników wyszukiwania',
                )
                try:
                    db.add(candidate)
                    db.commit()
                except Exception:
                    # Likely the unique (company_id, url) constraint on a
                    # repeated empty search - safe to ignore.
                    db.rollback()
                return {'error': 'Brak wyników', 'company_id': company.id}
            # Take best candidate (first non-directory URL).
            best = urls[0]
            url = best['url']
            domain = urlparse(url).netloc.lower()
            if domain.startswith('www.'):
                domain = domain[4:]
            # Check for existing candidate (unique per company+URL).
            existing = db.query(WebsiteDiscoveryCandidate).filter_by(
                company_id=company.id, candidate_url=url
            ).first()
            if existing:
                return {'status': 'exists', 'candidate_id': existing.id}
            # Fetch and extract identifiers from the page body.
            page_text = _fetch_page_text(url)
            extracted = {}
            if page_text:
                extracted = {
                    'nips': _find_nips_in_text(page_text),
                    'regons': _find_regons_in_text(page_text),
                    'krs': _find_krs_in_text(page_text),
                    'emails': _extract_emails(page_text),
                    'phones': _extract_phones(page_text),
                    'text_snippet': page_text[:500],
                }
            else:
                # Page unreachable / not HTML: still record the candidate,
                # just with no extracted data (all match flags stay False).
                extracted = {
                    'nips': [], 'regons': [], 'krs': [],
                    'emails': [], 'phones': [], 'text_snippet': '',
                }
            # Compute match signals against the company record.
            signals = self._compute_signals(extracted, company, page_text)
            confidence, score = self._compute_confidence(signals)
            # Save candidate for admin review (status defaults to 'pending').
            candidate = WebsiteDiscoveryCandidate(
                company_id=company.id,
                search_query=query,
                candidate_url=url,
                candidate_domain=domain,
                brave_title=best.get('title', ''),
                brave_description=best.get('description', ''),
                extracted_nips=extracted['nips'] or None,
                extracted_regons=extracted['regons'] or None,
                extracted_krs=extracted['krs'] or None,
                extracted_phones=extracted['phones'] or None,
                extracted_emails=extracted['emails'] or None,
                page_text_snippet=extracted['text_snippet'] or None,
                match_nip=signals.get('nip', False),
                match_regon=signals.get('regon', False),
                match_krs=signals.get('krs', False),
                match_phone=signals.get('phone', False),
                match_email=signals.get('email', False),
                match_city=signals.get('city', False),
                match_owner=signals.get('owner', False),
                confidence=confidence,
                match_score=score,
            )
            db.add(candidate)
            db.commit()
            return {
                'status': 'found',
                'candidate_id': candidate.id,
                'url': url,
                'confidence': confidence,
                'score': score,
                'signals': signals,
            }
        except Exception as e:
            db.rollback()
            logger.error(f"Discovery error for company {company.id}: {e}")
            return {'error': str(e)}
        finally:
            # Only close sessions this method opened itself.
            if own_session:
                db.close()

    def _search_brave(self, query):
        """Search Brave API, filter directories, return top URLs.

        Returns up to 5 dicts with 'url', 'title', 'description';
        empty list on API failure or no usable results.
        """
        try:
            headers = {
                'Accept': 'application/json',
                'X-Subscription-Token': self.brave_api_key,
            }
            params = {
                'q': query,
                'count': 10,
                'country': 'pl',
                'search_lang': 'pl',
            }
            resp = requests.get(
                'https://api.search.brave.com/res/v1/web/search',
                headers=headers, params=params, timeout=10,
            )
            if resp.status_code != 200:
                logger.warning(f"Brave API returned {resp.status_code}")
                return []
            data = resp.json()
            results = []
            for item in data.get('web', {}).get('results', []):
                url = item.get('url', '')
                # Skip aggregators/social profiles - we want the company's own site.
                if not url or _is_directory_domain(url):
                    continue
                results.append({
                    'url': url,
                    'title': item.get('title', ''),
                    'description': item.get('description', ''),
                })
            return results[:5]
        except Exception as e:
            logger.error(f"Brave search error: {e}")
            return []

    def _compute_signals(self, extracted, company, page_text=None):
        """Compare extracted data with company record.

        Returns a dict of booleans keyed by signal name; weights noted in
        the comments below match _compute_confidence.
        """
        signals = {}
        # NIP match (weight 3)
        if company.nip and extracted.get('nips'):
            company_nip = _normalize_nip(company.nip)
            signals['nip'] = company_nip in [_normalize_nip(n) for n in extracted['nips']]
        else:
            signals['nip'] = False
        # REGON match (weight 3)
        if company.regon and extracted.get('regons'):
            company_regon = re.sub(r'[^0-9]', '', company.regon)
            signals['regon'] = company_regon in [re.sub(r'[^0-9]', '', r) for r in extracted['regons']]
        else:
            signals['regon'] = False
        # KRS match (weight 3)
        if company.krs and extracted.get('krs'):
            company_krs = re.sub(r'[^0-9]', '', company.krs)
            signals['krs'] = company_krs in [re.sub(r'[^0-9]', '', k) for k in extracted['krs']]
        else:
            signals['krs'] = False
        # Phone match (weight 2) - last 9 digits, so +48 prefixes don't matter.
        if company.phone and extracted.get('phones'):
            company_digits = re.sub(r'\D', '', company.phone)[-9:]
            signals['phone'] = any(
                re.sub(r'\D', '', p)[-9:] == company_digits
                for p in extracted['phones']
            )
        else:
            signals['phone'] = False
        # Email match (weight 2) - exact address or same domain.
        if company.email and extracted.get('emails'):
            company_email = company.email.lower()
            company_domain = company_email.split('@')[-1] if '@' in company_email else ''
            signals['email'] = any(
                e == company_email or (company_domain and e.split('@')[-1] == company_domain)
                for e in extracted['emails']
            )
        else:
            signals['email'] = False
        # City match (weight 1) - plain case-insensitive substring search.
        text = page_text or ''
        if company.address_city and text:
            signals['city'] = company.address_city.lower() in text.lower()
        else:
            signals['city'] = False
        # Owner match (weight 1) - owner_name may not exist on older schemas.
        if hasattr(company, 'owner_name') and company.owner_name and text:
            signals['owner'] = company.owner_name.lower() in text.lower()
        else:
            signals['owner'] = False
        return signals

    def _compute_confidence(self, signals):
        """Compute confidence level and numeric score.

        Returns ('high'|'medium'|'low', score). A single strong identifier
        (NIP/REGON/KRS, weight 3) alone yields 'medium'; 'high' needs a
        strong identifier plus at least one supporting signal (score >= 5).
        """
        weights = {
            'nip': 3, 'regon': 3, 'krs': 3,
            'phone': 2, 'email': 2,
            'city': 1, 'owner': 1,
        }
        score = sum(weights[k] for k, v in signals.items() if v)
        if score >= 5:
            return 'high', score
        elif score >= 2:
            return 'medium', score
        else:
            return 'low', score

    def discover_bulk(self, limit=50, delay=2.0):
        """
        Bulk discovery for all companies without website.

        Opens its own session (independent of self.db) and binds a fresh
        service to it for the per-company work.

        Args:
            limit: max number of companies to process in one run.
            delay: seconds to sleep between companies (API rate limiting).
        Returns dict with progress info.
        """
        db = SessionLocal()
        try:
            # Find active/pending companies whose website field is empty.
            companies = db.query(Company).filter(
                Company.status.in_(['active', 'pending']),
                (Company.website == None) | (Company.website == ''),
            ).order_by(Company.name).limit(limit).all()
            results = {
                'total': len(companies),
                'processed': 0,
                'found': 0,
                'errors': 0,
                'details': [],
            }
            service = WebsiteDiscoveryService(db=db)
            for company in companies:
                result = service.discover_for_company(company)
                results['processed'] += 1
                if result.get('status') == 'found':
                    results['found'] += 1
                elif result.get('error'):
                    results['errors'] += 1
                results['details'].append({
                    'company_id': company.id,
                    'company_name': company.name,
                    'result': result,
                })
                # Rate limit between companies; skip the sleep after the last one.
                if results['processed'] < results['total']:
                    time.sleep(delay)
            return results
        finally:
            db.close()

View File

@ -455,6 +455,25 @@
background: #dbeafe;
}
/* Discovery badges */
.disc-badge {
display: inline-block;
padding: 1px 6px;
border-radius: var(--radius);
font-size: var(--font-size-xs);
font-weight: 500;
}
.disc-badge.disc-match {
background: #dcfce7;
color: #166534;
}
.disc-badge.disc-miss {
background: #f3f4f6;
color: #9ca3af;
}
/* Responsive */
@media (max-width: 768px) {
.dq-bar-label { width: 100px; font-size: var(--font-size-xs); }
@ -602,6 +621,115 @@
</div>
{% endif %}
<!-- Website Discovery Section -->
<div class="dq-section">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: var(--spacing-lg);">
<div class="dq-section-title" style="margin-bottom: 0;">
Odkryte strony WWW
{% if discovery_data %}
<span style="font-size: var(--font-size-sm); color: var(--text-secondary); font-weight: 400;">({{ discovery_data|length }} kandydatów)</span>
{% endif %}
</div>
<div style="display: flex; gap: var(--spacing-sm); align-items: center;">
{% if companies_without_website > 0 %}
<span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ companies_without_website }} firm bez WWW</span>
{% endif %}
<button onclick="discoverWebsitesBulk()" class="dq-bulk-btn"
style="background: var(--primary); color: white; padding: var(--spacing-sm) var(--spacing-lg); border-radius: var(--radius);">
Szukaj WWW
</button>
</div>
</div>
{% if discovery_data %}
<table class="dq-table" id="discoveryTable">
<thead>
<tr>
<th style="width: 5px;"></th>
<th>Firma</th>
<th>Strona</th>
<th>Dopasowania</th>
<th style="width: 80px;">Akcja</th>
</tr>
</thead>
<tbody>
{% for d in discovery_data %}
<tr id="disc-row-{{ d.id }}" style="border-left: 3px solid {% if d.confidence == 'high' %}#22c55e{% elif d.confidence == 'medium' %}#f59e0b{% else %}#d1d5db{% endif %};">
<td></td>
<td>
<a href="{{ url_for('admin.admin_company_detail', company_id=d.company_id) }}" class="dq-company-link">{{ d.company_name }}</a>
</td>
<td>
<a href="{{ d.url }}" target="_blank" style="color: var(--primary); text-decoration: none; font-weight: 500;">{{ d.domain }}</a>
{% if d.title %}
<br><span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ d.title[:60] }}</span>
{% endif %}
</td>
<td>
<div style="display: flex; gap: 4px; flex-wrap: wrap;">
{% if d.has_nip %}
<span class="disc-badge {% if d.match_nip %}disc-match{% else %}disc-miss{% endif %}">NIP</span>
{% endif %}
{% if d.has_regon %}
<span class="disc-badge {% if d.match_regon %}disc-match{% else %}disc-miss{% endif %}">REGON</span>
{% endif %}
{% if d.has_krs %}
<span class="disc-badge {% if d.match_krs %}disc-match{% else %}disc-miss{% endif %}">KRS</span>
{% endif %}
{% if d.has_phone %}
<span class="disc-badge {% if d.match_phone %}disc-match{% else %}disc-miss{% endif %}">Tel</span>
{% endif %}
{% if d.has_email %}
<span class="disc-badge {% if d.match_email %}disc-match{% else %}disc-miss{% endif %}">Email</span>
{% endif %}
{% if d.has_city %}
<span class="disc-badge {% if d.match_city %}disc-match{% else %}disc-miss{% endif %}">Miasto</span>
{% endif %}
{% if d.has_owner %}
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
{% endif %}
</div>
</td>
<td>
<div style="display: flex; gap: 4px;">
<button class="dq-action-btn" style="background: #dcfce7; border-color: #86efac; color: #166534;" title="Zatwierdź" onclick="acceptDiscovery({{ d.id }}, 'disc-row-{{ d.id }}')">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3"><path d="M5 13l4 4L19 7"/></svg>
</button>
<button class="dq-action-btn" style="background: #fee2e2; border-color: #fca5a5; color: #991b1b;" title="Odrzuć" onclick="rejectDiscovery({{ d.id }}, 'disc-row-{{ d.id }}')">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3"><path d="M6 18L18 6M6 6l12 12"/></svg>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="color: var(--text-secondary); font-size: var(--font-size-sm);">Brak kandydatów. Kliknij "Szukaj WWW" aby uruchomić wyszukiwanie.</p>
{% endif %}
</div>
<!-- Bulk Discovery Modal -->
<div id="discoveryModal" style="display: none; position: fixed; top: 0; left: 0; right: 0; bottom: 0; background: rgba(0,0,0,0.5); z-index: 9999; align-items: center; justify-content: center;">
<div style="background: var(--surface); border-radius: var(--radius-xl); padding: var(--spacing-xl); max-width: 500px; width: 90%; max-height: 80vh; overflow-y: auto;">
<h3 style="margin-bottom: var(--spacing-lg);">Wyszukiwanie stron WWW</h3>
<p style="color: var(--text-secondary); margin-bottom: var(--spacing-lg);">
Szukam stron internetowych dla firm bez uzupełnionej strony WWW...
</p>
<div style="display: flex; justify-content: space-between; margin-bottom: var(--spacing-sm);">
<span style="font-weight: 600;">Postęp</span>
<span id="discProgressText">0/0</span>
</div>
<div style="height: 8px; background: var(--background); border-radius: 4px; overflow: hidden;">
<div id="discProgressBar" style="height: 100%; background: var(--primary); border-radius: 4px; transition: width 0.3s; width: 0%;"></div>
</div>
<div id="discProgressLog" style="margin-top: var(--spacing-md); max-height: 250px; overflow-y: auto; font-size: var(--font-size-xs); font-family: monospace; color: var(--text-secondary);"></div>
<div style="margin-top: var(--spacing-lg); text-align: right;">
<button id="discCloseBtn" onclick="closeDiscoveryModal()" style="display: none; padding: var(--spacing-sm) var(--spacing-lg); border: 1px solid var(--border); border-radius: var(--radius); background: var(--surface); cursor: pointer;">Zamknij</button>
</div>
</div>
</div>
<!-- Companies Table -->
<div class="dq-section">
<div class="dq-section-title">Firmy wg kompletności danych</div>
@ -1037,4 +1165,123 @@
if (btn && !btn.disabled) btn.click();
});
}
// --- Website Discovery ---
function discoverWebsite(companyId, btn) {
    // Kick off discovery for a single company; the button itself shows progress.
    if (btn.disabled) return;
    var savedHTML = btn.innerHTML;
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    btn.disabled = true;
    btn.innerHTML = '...';
    var restore = function(title) {
        btn.innerHTML = savedHTML;
        btn.disabled = false;
        btn.title = title;
    };
    fetch('/admin/discover-website/' + companyId, {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (data.success && data.status === 'found') {
            // Green check replaces the button content; URL shown in the tooltip.
            btn.innerHTML = '<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="#22c55e" stroke-width="3"><path d="M5 13l4 4L19 7"/></svg>';
            btn.title = 'Znaleziono: ' + (data.url || '');
        } else {
            restore(data.error || 'Brak wyników');
        }
    })
    .catch(function(err) {
        restore('Błąd: ' + err.message);
    });
}
function discoverWebsitesBulk() {
    // Reset and open the progress modal, then launch the server-side bulk job.
    var closeBtn = document.getElementById('discCloseBtn');
    var log = document.getElementById('discProgressLog');
    document.getElementById('discoveryModal').style.display = 'flex';
    closeBtn.style.display = 'none';
    document.getElementById('discProgressBar').style.width = '0%';
    document.getElementById('discProgressText').textContent = 'Uruchamiam...';
    log.innerHTML = '';
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discover-websites-bulk', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (data.job_id) {
            pollDiscoveryProgress(data.job_id);
        }
    })
    .catch(function(err) {
        log.innerHTML = '<div style="color: #ef4444;">Błąd: ' + err.message + '</div>';
        closeBtn.style.display = 'inline-block';
    });
}
function pollDiscoveryProgress(jobId) {
    // Poll the bulk job every 3s and update the modal's bar and log.
    // Fixes vs. original: (1) the same latest_result is no longer re-appended
    // on every poll (polling is faster than processing, which duplicated log
    // lines); (2) server-provided text is inserted via textContent so company
    // names containing HTML cannot inject markup into the page.
    fetch('/admin/discover-websites-status?job_id=' + jobId)
    .then(function(r) { return r.json(); })
    .then(function(data) {
        var total = data.total || 1;
        var processed = data.processed || 0;
        var pct = Math.round(processed / total * 100);
        document.getElementById('discProgressBar').style.width = pct + '%';
        document.getElementById('discProgressText').textContent = processed + '/' + total;
        var log = document.getElementById('discProgressLog');
        if (data.latest_result && data.latest_result !== pollDiscoveryProgress._lastMsg) {
            pollDiscoveryProgress._lastMsg = data.latest_result;
            var line = document.createElement('div');
            line.textContent = data.latest_result;
            log.appendChild(line);
            log.scrollTop = log.scrollHeight;
        }
        if (data.status === 'running') {
            setTimeout(function() { pollDiscoveryProgress(jobId); }, 3000);
        } else {
            var done = document.createElement('div');
            done.style.color = '#22c55e';
            done.style.fontWeight = '600';
            done.textContent = 'Zakończono! Odśwież stronę aby zobaczyć wyniki.';
            log.appendChild(done);
            log.scrollTop = log.scrollHeight;
            document.getElementById('discCloseBtn').style.display = 'inline-block';
        }
    });
}
function closeDiscoveryModal() {
    // Hide the bulk-discovery progress modal.
    var modal = document.getElementById('discoveryModal');
    modal.style.display = 'none';
}
function acceptDiscovery(candidateId, rowId) {
    // Approve a candidate: the server copies the URL onto the company record.
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discovery/' + candidateId + '/accept', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (!data.success) {
            alert('Błąd: ' + (data.error || 'nieznany'));
            return;
        }
        // Fade the row to indicate the candidate was handled.
        var row = document.getElementById(rowId);
        if (row) row.style.opacity = '0.3';
    })
    .catch(function(err) { alert('Błąd: ' + err.message); });
}
function rejectDiscovery(candidateId, rowId) {
    // Reject a candidate; on success the row disappears from the table.
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discovery/' + candidateId + '/reject', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (!data.success) {
            alert('Błąd: ' + (data.error || 'nieznany'));
            return;
        }
        var row = document.getElementById(rowId);
        if (row) row.remove();
    })
    .catch(function(err) { alert('Błąd: ' + err.message); });
}
{% endblock %}