feat: add website discovery service for companies without websites
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Automated discovery using Brave Search API to find company websites,
scrape verification data (NIP/REGON/KRS/email/phone), and present
candidates with match badges in the data quality dashboard.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 08:27:13 +01:00
parent 01bc40132e
commit 126eff8af6
7 changed files with 1021 additions and 1 deletion

View File

@ -31,3 +31,4 @@ from . import routes_competitors # noqa: E402, F401
from . import routes_social_publisher # noqa: E402, F401
from . import routes_data_quality # noqa: E402, F401
from . import routes_bulk_enrichment # noqa: E402, F401
from . import routes_website_discovery # noqa: E402, F401

View File

@ -16,7 +16,8 @@ from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole
CompanySocialMedia, GBPAudit, SystemRole,
WebsiteDiscoveryCandidate
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
@ -212,6 +213,48 @@ def admin_data_quality():
'google_name': g_name,
})
# Website discovery candidates (pending)
discovery_candidates = db.query(WebsiteDiscoveryCandidate).filter(
WebsiteDiscoveryCandidate.status == 'pending',
WebsiteDiscoveryCandidate.candidate_url != 'none',
).order_by(WebsiteDiscoveryCandidate.match_score.desc()).all()
# Enrich with company name
discovery_data = []
for dc in discovery_candidates:
comp = company_map.get(dc.company_id)
if not comp:
continue
discovery_data.append({
'id': dc.id,
'company_id': dc.company_id,
'company_name': comp.name,
'company_slug': comp.slug,
'url': dc.candidate_url,
'domain': dc.candidate_domain or '',
'title': dc.brave_title or '',
'description': (dc.brave_description or '')[:100],
'match_nip': dc.match_nip,
'match_regon': dc.match_regon,
'match_krs': dc.match_krs,
'match_phone': dc.match_phone,
'match_email': dc.match_email,
'match_city': dc.match_city,
'match_owner': dc.match_owner,
'confidence': dc.confidence,
'score': dc.match_score,
'has_nip': bool(comp.nip),
'has_regon': bool(comp.regon),
'has_krs': bool(comp.krs),
'has_phone': bool(comp.phone),
'has_email': bool(comp.email),
'has_city': bool(comp.address_city),
'has_owner': bool(getattr(comp, 'owner_name', None)),
})
# Count companies without website
companies_without_website = sum(1 for c in companies_table if not c['website'])
return render_template(
'admin/data_quality_dashboard.html',
total=total,
@ -221,6 +264,8 @@ def admin_data_quality():
avg_score=avg_score,
companies_table=companies_table,
available_data=available_data,
discovery_data=discovery_data,
companies_without_website=companies_without_website,
now=now,
)
finally:

View File

@ -0,0 +1,167 @@
"""
Admin Website Discovery Routes
================================
Endpoints for discovering and managing website candidates for companies.
"""
import logging
import threading
import time
import uuid
from datetime import datetime

from flask import request, jsonify
from flask_login import login_required

from . import bp
from database import SessionLocal, Company, WebsiteDiscoveryCandidate, SystemRole
from services.website_discovery_service import WebsiteDiscoveryService
from utils.data_quality import update_company_data_quality
from utils.decorators import role_required
logger = logging.getLogger(__name__)
# Store bulk job progress
_bulk_jobs = {}
@bp.route('/discover-website/<int:company_id>', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_website(company_id):
    """Discover a website candidate for a single company.

    Runs a synchronous Brave search + scrape via WebsiteDiscoveryService.

    Returns:
        200 {'success': True, ...result} when discovery produced a result,
        200 {'success': False, 'error': ...} for a discovery-level error,
        404 when the company does not exist,
        500 on an unexpected failure.
    """
    db = SessionLocal()
    try:
        company = db.query(Company).get(company_id)
        if not company:
            return jsonify({'error': 'Firma nie znaleziona'}), 404
        service = WebsiteDiscoveryService(db=db)
        result = service.discover_for_company(company)
        if result.get('error'):
            return jsonify({'success': False, 'error': result['error']})
        return jsonify({'success': True, **result})
    except Exception as e:
        # logger.exception records the full traceback; lazy %-args avoid
        # building the message when the level is disabled.
        logger.exception("Discovery error for company %s", company_id)
        return jsonify({'success': False, 'error': str(e)}), 500
    finally:
        db.close()
@bp.route('/discover-websites-bulk', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_bulk():
    """Start bulk website discovery in a background thread.

    Registers the job in ``_bulk_jobs`` *before* the worker thread starts:
    previously the entry was created inside the thread, so a status poll
    arriving before the thread was scheduled got a spurious 404.

    Returns {'success': True, 'job_id': ...}; progress is polled via
    /discover-websites-status.
    """
    job_id = str(uuid.uuid4())[:8]
    # Register synchronously so the first client poll always finds the job.
    _bulk_jobs[job_id] = {'status': 'running', 'processed': 0, 'total': 0, 'latest_result': ''}

    def run_bulk(job_id):
        db = SessionLocal()
        try:
            # Up to 50 active/pending companies whose website field is empty.
            companies = db.query(Company).filter(
                Company.status.in_(['active', 'pending']),
                (Company.website == None) | (Company.website == ''),  # noqa: E711
            ).order_by(Company.name).limit(50).all()
            _bulk_jobs[job_id]['total'] = len(companies)
            service = WebsiteDiscoveryService(db=db)
            for company in companies:
                result = service.discover_for_company(company)
                _bulk_jobs[job_id]['processed'] += 1
                status_text = f"{company.name}: "
                if result.get('status') == 'found':
                    status_text += f"znaleziono {result.get('url', '?')} ({result.get('confidence', '?')})"
                elif result.get('status') == 'exists':
                    status_text += "kandydat już istnieje"
                else:
                    status_text += result.get('error', 'błąd')
                _bulk_jobs[job_id]['latest_result'] = status_text
                # Rate-limit between companies (Brave API quota + polite scraping).
                if _bulk_jobs[job_id]['processed'] < _bulk_jobs[job_id]['total']:
                    time.sleep(2)
            _bulk_jobs[job_id]['status'] = 'completed'
        except Exception as e:
            logger.exception("Bulk discovery error")
            _bulk_jobs[job_id]['status'] = 'error'
            _bulk_jobs[job_id]['latest_result'] = str(e)
        finally:
            db.close()

    thread = threading.Thread(target=run_bulk, args=(job_id,), daemon=True)
    thread.start()
    return jsonify({'success': True, 'job_id': job_id})
@bp.route('/discover-websites-status')
@login_required
@role_required(SystemRole.ADMIN)
def discover_websites_status():
    """Return the progress dict for a running bulk-discovery job (404 if unknown)."""
    job_id = request.args.get('job_id')
    job = _bulk_jobs.get(job_id) if job_id else None
    if job is None:
        return jsonify({'error': 'Job not found'}), 404
    return jsonify(job)
@bp.route('/discovery/<int:candidate_id>/accept', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def accept_discovery(candidate_id):
    """Accept a discovery candidate - set company.website."""
    session = SessionLocal()
    try:
        cand = session.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if cand is None:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404
        firm = session.query(Company).get(cand.company_id)
        if firm is None:
            return jsonify({'error': 'Firma nie znaleziona'}), 404
        # Promote the candidate URL onto the company record and mark it reviewed.
        firm.website = cand.candidate_url
        cand.status = 'accepted'
        cand.reviewed_at = datetime.now()
        # Recompute completeness now that the website field is filled.
        update_company_data_quality(firm, session)
        session.commit()
        logger.info(f"Accepted website {cand.candidate_url} for company {firm.name}")
        return jsonify({'success': True, 'url': cand.candidate_url})
    except Exception as e:
        session.rollback()
        logger.error(f"Accept error: {e}")
        return jsonify({'error': str(e)}), 500
    finally:
        session.close()
@bp.route('/discovery/<int:candidate_id>/reject', methods=['POST'])
@login_required
@role_required(SystemRole.ADMIN)
def reject_discovery(candidate_id):
    """Reject a discovery candidate.

    Marks the candidate 'rejected' so it disappears from the pending list
    on the data-quality dashboard.
    """
    db = SessionLocal()
    try:
        candidate = db.query(WebsiteDiscoveryCandidate).get(candidate_id)
        if not candidate:
            return jsonify({'error': 'Kandydat nie znaleziony'}), 404
        candidate.status = 'rejected'
        candidate.reviewed_at = datetime.now()
        db.commit()
        return jsonify({'success': True})
    except Exception as e:
        db.rollback()
        # Previously failures here were silent; log with traceback like the
        # accept endpoint does.
        logger.exception("Reject error for candidate %s", candidate_id)
        return jsonify({'error': str(e)}), 500
    finally:
        db.close()

View File

@ -5463,6 +5463,47 @@ class SocialMediaConfig(Base):
return f'<SocialMediaConfig {self.platform} company_id={self.company_id} page={self.page_name}>'
class WebsiteDiscoveryCandidate(Base):
    """Website candidates found via Brave Search for companies missing website field.

    One row per (company, URL) pair, enforced by ``uq_wdc_company_url``.
    Rows start as status 'pending' and move to 'accepted'/'rejected' via the
    admin data-quality dashboard; 'error' rows (candidate_url 'none') record
    searches that returned no results.
    """
    __tablename__ = 'website_discovery_candidates'
    id = Column(Integer, primary_key=True)
    company_id = Column(Integer, ForeignKey('companies.id'), nullable=False)
    discovered_at = Column(DateTime, default=datetime.now)
    # What was searched and what Brave returned for the chosen result.
    search_query = Column(Text)
    candidate_url = Column(String(500), nullable=False)  # 'none' when search had no results
    candidate_domain = Column(String(255))
    brave_title = Column(Text)
    brave_description = Column(Text)
    # Identifiers scraped from the candidate page (NULL when nothing found).
    extracted_nips = Column(PG_ARRAY(Text))
    extracted_regons = Column(PG_ARRAY(Text))
    extracted_krs = Column(PG_ARRAY(Text))
    extracted_phones = Column(PG_ARRAY(Text))
    extracted_emails = Column(PG_ARRAY(Text))
    page_text_snippet = Column(Text)
    # Per-signal comparison flags against the company record.
    match_nip = Column(Boolean, default=False)
    match_regon = Column(Boolean, default=False)
    match_krs = Column(Boolean, default=False)
    match_phone = Column(Boolean, default=False)
    match_email = Column(Boolean, default=False)
    match_city = Column(Boolean, default=False)
    match_owner = Column(Boolean, default=False)
    # Aggregate verdict: confidence in {'low','medium','high'}, weighted score.
    confidence = Column(String(10), default='low')
    match_score = Column(Integer, default=0)
    # Review lifecycle: 'pending' | 'accepted' | 'rejected' | 'error'.
    status = Column(String(20), default='pending')
    reviewed_at = Column(DateTime)
    error_message = Column(Text)
    company = relationship('Company', foreign_keys=[company_id])
    __table_args__ = (
        UniqueConstraint('company_id', 'candidate_url', name='uq_wdc_company_url'),
    )

    def __repr__(self):
        return f'<WebsiteDiscoveryCandidate {self.id} company={self.company_id} confidence={self.confidence}>'
# ============================================================
# DATABASE INITIALIZATION
# ============================================================

View File

@ -0,0 +1,38 @@
-- Website Discovery Candidates
-- Stores website candidates found via Brave Search for companies missing website field.
-- Mirrors the SQLAlchemy model WebsiteDiscoveryCandidate.
CREATE TABLE IF NOT EXISTS website_discovery_candidates (
    id SERIAL PRIMARY KEY,
    company_id INTEGER NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    discovered_at TIMESTAMP DEFAULT NOW(),
    -- Search context: query sent to Brave and the chosen result.
    search_query TEXT,
    candidate_url VARCHAR(500) NOT NULL, -- 'none' sentinel when search had no results
    candidate_domain VARCHAR(255),
    brave_title TEXT,
    brave_description TEXT,
    -- Identifiers scraped from the candidate page.
    extracted_nips TEXT[],
    extracted_regons TEXT[],
    extracted_krs TEXT[],
    extracted_phones TEXT[],
    extracted_emails TEXT[],
    page_text_snippet TEXT,
    -- Per-signal comparison flags against the company record.
    match_nip BOOLEAN DEFAULT FALSE,
    match_regon BOOLEAN DEFAULT FALSE,
    match_krs BOOLEAN DEFAULT FALSE,
    match_phone BOOLEAN DEFAULT FALSE,
    match_email BOOLEAN DEFAULT FALSE,
    match_city BOOLEAN DEFAULT FALSE,
    match_owner BOOLEAN DEFAULT FALSE,
    -- Aggregate verdict and review lifecycle.
    confidence VARCHAR(10) DEFAULT 'low',      -- 'low' | 'medium' | 'high'
    match_score INTEGER DEFAULT 0,
    status VARCHAR(20) DEFAULT 'pending',      -- 'pending' | 'accepted' | 'rejected' | 'error'
    reviewed_at TIMESTAMP,
    error_message TEXT,
    -- One candidate row per (company, URL) pair.
    UNIQUE(company_id, candidate_url)
);
-- Dashboard queries filter by status and join on company.
CREATE INDEX IF NOT EXISTS idx_wdc_status ON website_discovery_candidates(status);
CREATE INDEX IF NOT EXISTS idx_wdc_company ON website_discovery_candidates(company_id);
GRANT ALL ON TABLE website_discovery_candidates TO nordabiz_app;
GRANT USAGE, SELECT ON SEQUENCE website_discovery_candidates_id_seq TO nordabiz_app;

View File

@ -0,0 +1,481 @@
"""
Website Discovery Service
==========================
Discovers websites for companies that don't have one registered.
Uses Brave Web Search API to find candidates, scrapes them for verification data,
and compares extracted information against known company data.
"""
import os
import re
import time
import logging
from urllib.parse import urlparse
from datetime import datetime
import requests
from bs4 import BeautifulSoup
from database import SessionLocal, Company, WebsiteDiscoveryCandidate
from utils.data_quality import update_company_data_quality
logger = logging.getLogger(__name__)
# Domains to skip - business directories, social media, own portal.
# We want the company's *own* site, not its listing on an aggregator;
# _is_directory_domain also skips any subdomain of these.
DIRECTORY_DOMAINS = {
    'panoramafirm.pl', 'aleo.com', 'rejestr.io', 'krs-pobierz.pl',
    'gowork.pl', 'oferteo.pl', 'pkt.pl', 'firmy.net', 'zumi.pl',
    'baza-firm.com.pl', 'e-krs.pl', 'krs-online.com.pl', 'regon.info',
    'infoveriti.pl', 'companywall.pl', 'findcompany.pl', 'owg.pl',
    'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
    'twitter.com', 'x.com', 'tiktok.com',
    'nordabiznes.pl', 'google.com', 'google.pl',
    'wikipedia.org', 'olx.pl', 'allegro.pl',
}
# --- Extraction helpers ---
def _normalize_nip(nip):
return re.sub(r'[^0-9]', '', nip)
def _validate_nip(nip):
    """Validate a Polish NIP via its weighted mod-11 check digit.

    Non-digit characters are stripped first, so formatted input
    ('526-025-02-74') validates the same as a bare 10-digit string.
    """
    digits = ''.join(ch for ch in nip if ch in '0123456789')
    if len(digits) != 10:
        return False
    try:
        weighted = zip((6, 5, 7, 2, 3, 4, 5, 6, 7), digits)
        control = sum(w * int(d) for w, d in weighted) % 11
        # A control of 10 can never equal a single digit -> invalid.
        return control == int(digits[9])
    except (ValueError, IndexError):
        return False
def _validate_regon(regon):
regon = re.sub(r'[^0-9]', '', regon)
if len(regon) == 9:
weights = [8, 9, 2, 3, 4, 5, 6, 7]
checksum = sum(int(regon[i]) * weights[i] for i in range(8)) % 11
if checksum == 10:
checksum = 0
return checksum == int(regon[8])
elif len(regon) == 14:
weights9 = [8, 9, 2, 3, 4, 5, 6, 7]
checksum9 = sum(int(regon[i]) * weights9[i] for i in range(8)) % 11
if checksum9 == 10:
checksum9 = 0
if checksum9 != int(regon[8]):
return False
weights14 = [2, 4, 8, 5, 0, 9, 7, 3, 6, 1, 2, 4, 8]
checksum14 = sum(int(regon[i]) * weights14[i] for i in range(13)) % 11
if checksum14 == 10:
checksum14 = 0
return checksum14 == int(regon[13])
return False
def _find_nips_in_text(text):
    """Extract validated, de-duplicated NIP numbers from free text (first-seen order)."""
    patterns = (
        r'NIP[:\s]*(\d{3}[-\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2})',
        r'NIP[:\s]*(\d{10})',
        r'numer\s+identyfikacji\s+podatkowej[:\s]*(\d{10})',
    )
    found = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = _normalize_nip(raw)
            if _validate_nip(candidate) and candidate not in found:
                found.append(candidate)
    return found
def _find_regons_in_text(text):
    """Extract validated, de-duplicated REGON numbers from free text (first-seen order)."""
    patterns = (
        r'REGON[:\s]*(\d{9,14})',
        r'rejestr\s+gospodarczy[:\s]*(\d{9,14})',
    )
    found = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = re.sub(r'[^0-9]', '', raw)
            if _validate_regon(candidate) and candidate not in found:
                found.append(candidate)
    return found
def _find_krs_in_text(text):
patterns = [
r'KRS[:\s]*(\d{10})',
r'Krajow\w+\s+Rejestr\w*\s+S[aą]dow\w*[:\s]*(\d{10})',
]
krs_numbers = []
for pattern in patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for match in matches:
krs = re.sub(r'[^0-9]', '', match)
if len(krs) == 10 and krs not in krs_numbers:
krs_numbers.append(krs)
return krs_numbers
def _extract_emails(text):
email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
emails = re.findall(email_pattern, text)
skip_domains = {
'example.com', 'sentry.io', 'wixpress.com', 'wordpress.org',
'w3.org', 'schema.org', 'googleapis.com',
}
return list(dict.fromkeys(
e.lower() for e in emails
if not any(d in e.lower() for d in skip_domains)
))[:5]
def _extract_phones(text):
phone_patterns = [
r'(?:\+48\s?)?\d{2}[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}',
r'(?:\+48\s?)?\d{3}[\s-]?\d{3}[\s-]?\d{3}',
r'\(\d{2}\)\s?\d{3}[\s-]?\d{2}[\s-]?\d{2}',
r'(?:tel|phone|telefon)[.:]\s*[\+]?\d[\d\s\-]{7,14}',
]
phones = []
for pattern in phone_patterns:
matches = re.findall(pattern, text, re.IGNORECASE)
for m in matches:
clean = re.sub(r'(?:tel|phone|telefon)[.:]?\s*', '', m, flags=re.IGNORECASE).strip()
digits = re.sub(r'\D', '', clean)
if 9 <= len(digits) <= 12:
phones.append(clean)
return list(dict.fromkeys(phones))[:5]
def _is_directory_domain(url):
    """Return True when *url*'s host is (or is a subdomain of) a known directory/social site."""
    try:
        host = urlparse(url).netloc.lower()
        if host.startswith('www.'):
            host = host[4:]
        for known in DIRECTORY_DOMAINS:
            if host == known or host.endswith('.' + known):
                return True
        return False
    except Exception:
        # Malformed URLs are treated as non-directory; caller decides what to do.
        return False
def _fetch_page_text(url, timeout=15):
    """Fetch *url* and return its visible text content, or None on failure.

    Strips script/style/navigation chrome before extracting text so scraped
    identifiers come from body content. On an SSL error the fetch is retried
    once over plain HTTP (small-business sites often have broken certs).

    Refactored: the fetch+parse path is shared between the HTTPS attempt and
    the HTTP retry (previously duplicated), and the retry now applies the
    same content-type guard as the first attempt.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; NordaBizBot/1.0)',
        'Accept': 'text/html,application/xhtml+xml',
    }

    def _get_text(target):
        # Shared fetch + HTML-to-text pipeline.
        resp = requests.get(target, headers=headers, timeout=timeout, allow_redirects=True)
        content_type = resp.headers.get('Content-Type', '')
        if 'text/html' not in content_type and 'application/xhtml' not in content_type:
            return None
        resp.encoding = resp.apparent_encoding or 'utf-8'
        soup = BeautifulSoup(resp.text, 'html.parser')
        # Remove non-content elements (scripts, chrome) before text extraction.
        for tag in soup.find_all(['script', 'style', 'nav', 'footer', 'header', 'noscript']):
            tag.decompose()
        return soup.get_text(separator=' ', strip=True)

    try:
        return _get_text(url)
    except requests.exceptions.SSLError:
        # Retry once over HTTP for sites with broken certificates.
        try:
            return _get_text(url.replace('https://', 'http://'))
        except Exception:
            return None
    except Exception as e:
        logger.warning(f"Failed to fetch {url}: {e}")
        return None
class WebsiteDiscoveryService:
    """Discovers and validates website candidates for companies.

    Pipeline per company: Brave web search -> directory-domain filtering ->
    scrape the best candidate -> extract NIP/REGON/KRS/phone/email ->
    compare against the company record -> persist a WebsiteDiscoveryCandidate
    row for admin review.
    """

    def __init__(self, db=None):
        # db: optional externally-owned SQLAlchemy session. When omitted,
        # discover_for_company opens and closes its own session per call.
        self.db = db
        self.brave_api_key = os.getenv('BRAVE_API_KEY')

    def discover_for_company(self, company):
        """
        Search for website, scrape, compare, save candidate.
        Returns dict with result info: {'status': 'found'|'exists', ...}
        on success, or {'error': ...} on failure / no results.
        """
        if not self.brave_api_key:
            return {'error': 'BRAVE_API_KEY not configured'}
        db = self.db or SessionLocal()
        own_session = self.db is None
        try:
            # Build search query: quoted company name plus city (if known).
            city = company.address_city or ''
            query = f'"{company.name}" {city}'.strip()
            # Search Brave (directory/social domains already filtered out).
            urls = self._search_brave(query)
            if not urls:
                # Save error candidate with the 'none' URL sentinel so the
                # company is not retried endlessly; dashboard hides these.
                candidate = WebsiteDiscoveryCandidate(
                    company_id=company.id,
                    search_query=query,
                    candidate_url='none',
                    status='error',
                    error_message='Brak wyników wyszukiwania',
                )
                try:
                    db.add(candidate)
                    db.commit()
                except Exception:
                    # Likely the unique (company_id, url) constraint on a
                    # repeated empty search - safe to ignore.
                    db.rollback()
                return {'error': 'Brak wyników', 'company_id': company.id}
            # Take best candidate (first non-directory URL).
            best = urls[0]
            url = best['url']
            domain = urlparse(url).netloc.lower()
            if domain.startswith('www.'):
                domain = domain[4:]
            # Check for existing candidate (unique per company+URL).
            existing = db.query(WebsiteDiscoveryCandidate).filter_by(
                company_id=company.id, candidate_url=url
            ).first()
            if existing:
                return {'status': 'exists', 'candidate_id': existing.id}
            # Fetch and extract identifiers from the page body.
            page_text = _fetch_page_text(url)
            extracted = {}
            if page_text:
                extracted = {
                    'nips': _find_nips_in_text(page_text),
                    'regons': _find_regons_in_text(page_text),
                    'krs': _find_krs_in_text(page_text),
                    'emails': _extract_emails(page_text),
                    'phones': _extract_phones(page_text),
                    'text_snippet': page_text[:500],
                }
            else:
                # Page unreachable / not HTML: still record the candidate,
                # just with no extracted data (all match flags stay False).
                extracted = {
                    'nips': [], 'regons': [], 'krs': [],
                    'emails': [], 'phones': [], 'text_snippet': '',
                }
            # Compute match signals against the company record.
            signals = self._compute_signals(extracted, company, page_text)
            confidence, score = self._compute_confidence(signals)
            # Save candidate for admin review (status defaults to 'pending').
            candidate = WebsiteDiscoveryCandidate(
                company_id=company.id,
                search_query=query,
                candidate_url=url,
                candidate_domain=domain,
                brave_title=best.get('title', ''),
                brave_description=best.get('description', ''),
                extracted_nips=extracted['nips'] or None,
                extracted_regons=extracted['regons'] or None,
                extracted_krs=extracted['krs'] or None,
                extracted_phones=extracted['phones'] or None,
                extracted_emails=extracted['emails'] or None,
                page_text_snippet=extracted['text_snippet'] or None,
                match_nip=signals.get('nip', False),
                match_regon=signals.get('regon', False),
                match_krs=signals.get('krs', False),
                match_phone=signals.get('phone', False),
                match_email=signals.get('email', False),
                match_city=signals.get('city', False),
                match_owner=signals.get('owner', False),
                confidence=confidence,
                match_score=score,
            )
            db.add(candidate)
            db.commit()
            return {
                'status': 'found',
                'candidate_id': candidate.id,
                'url': url,
                'confidence': confidence,
                'score': score,
                'signals': signals,
            }
        except Exception as e:
            db.rollback()
            logger.error(f"Discovery error for company {company.id}: {e}")
            return {'error': str(e)}
        finally:
            # Only close sessions this method opened itself.
            if own_session:
                db.close()

    def _search_brave(self, query):
        """Search Brave API, filter directories, return top URLs.

        Returns up to 5 dicts with 'url', 'title', 'description';
        empty list on API failure or no usable results.
        """
        try:
            headers = {
                'Accept': 'application/json',
                'X-Subscription-Token': self.brave_api_key,
            }
            params = {
                'q': query,
                'count': 10,
                'country': 'pl',
                'search_lang': 'pl',
            }
            resp = requests.get(
                'https://api.search.brave.com/res/v1/web/search',
                headers=headers, params=params, timeout=10,
            )
            if resp.status_code != 200:
                logger.warning(f"Brave API returned {resp.status_code}")
                return []
            data = resp.json()
            results = []
            for item in data.get('web', {}).get('results', []):
                url = item.get('url', '')
                # Skip aggregators/social profiles - we want the company's own site.
                if not url or _is_directory_domain(url):
                    continue
                results.append({
                    'url': url,
                    'title': item.get('title', ''),
                    'description': item.get('description', ''),
                })
            return results[:5]
        except Exception as e:
            logger.error(f"Brave search error: {e}")
            return []

    def _compute_signals(self, extracted, company, page_text=None):
        """Compare extracted data with company record.

        Returns a dict of booleans keyed by signal name; weights noted in
        the comments below match _compute_confidence.
        """
        signals = {}
        # NIP match (weight 3)
        if company.nip and extracted.get('nips'):
            company_nip = _normalize_nip(company.nip)
            signals['nip'] = company_nip in [_normalize_nip(n) for n in extracted['nips']]
        else:
            signals['nip'] = False
        # REGON match (weight 3)
        if company.regon and extracted.get('regons'):
            company_regon = re.sub(r'[^0-9]', '', company.regon)
            signals['regon'] = company_regon in [re.sub(r'[^0-9]', '', r) for r in extracted['regons']]
        else:
            signals['regon'] = False
        # KRS match (weight 3)
        if company.krs and extracted.get('krs'):
            company_krs = re.sub(r'[^0-9]', '', company.krs)
            signals['krs'] = company_krs in [re.sub(r'[^0-9]', '', k) for k in extracted['krs']]
        else:
            signals['krs'] = False
        # Phone match (weight 2) - last 9 digits, so +48 prefixes don't matter.
        if company.phone and extracted.get('phones'):
            company_digits = re.sub(r'\D', '', company.phone)[-9:]
            signals['phone'] = any(
                re.sub(r'\D', '', p)[-9:] == company_digits
                for p in extracted['phones']
            )
        else:
            signals['phone'] = False
        # Email match (weight 2) - exact address or same domain.
        if company.email and extracted.get('emails'):
            company_email = company.email.lower()
            company_domain = company_email.split('@')[-1] if '@' in company_email else ''
            signals['email'] = any(
                e == company_email or (company_domain and e.split('@')[-1] == company_domain)
                for e in extracted['emails']
            )
        else:
            signals['email'] = False
        # City match (weight 1) - plain case-insensitive substring search.
        text = page_text or ''
        if company.address_city and text:
            signals['city'] = company.address_city.lower() in text.lower()
        else:
            signals['city'] = False
        # Owner match (weight 1) - owner_name may not exist on older schemas.
        if hasattr(company, 'owner_name') and company.owner_name and text:
            signals['owner'] = company.owner_name.lower() in text.lower()
        else:
            signals['owner'] = False
        return signals

    def _compute_confidence(self, signals):
        """Compute confidence level and numeric score.

        Returns ('high'|'medium'|'low', score). A single strong identifier
        (NIP/REGON/KRS, weight 3) alone yields 'medium'; 'high' needs a
        strong identifier plus at least one supporting signal (score >= 5).
        """
        weights = {
            'nip': 3, 'regon': 3, 'krs': 3,
            'phone': 2, 'email': 2,
            'city': 1, 'owner': 1,
        }
        score = sum(weights[k] for k, v in signals.items() if v)
        if score >= 5:
            return 'high', score
        elif score >= 2:
            return 'medium', score
        else:
            return 'low', score

    def discover_bulk(self, limit=50, delay=2.0):
        """
        Bulk discovery for all companies without website.

        Opens its own session (independent of self.db) and binds a fresh
        service to it for the per-company work.

        Args:
            limit: max number of companies to process in one run.
            delay: seconds to sleep between companies (API rate limiting).
        Returns dict with progress info.
        """
        db = SessionLocal()
        try:
            # Find active/pending companies whose website field is empty.
            companies = db.query(Company).filter(
                Company.status.in_(['active', 'pending']),
                (Company.website == None) | (Company.website == ''),
            ).order_by(Company.name).limit(limit).all()
            results = {
                'total': len(companies),
                'processed': 0,
                'found': 0,
                'errors': 0,
                'details': [],
            }
            service = WebsiteDiscoveryService(db=db)
            for company in companies:
                result = service.discover_for_company(company)
                results['processed'] += 1
                if result.get('status') == 'found':
                    results['found'] += 1
                elif result.get('error'):
                    results['errors'] += 1
                results['details'].append({
                    'company_id': company.id,
                    'company_name': company.name,
                    'result': result,
                })
                # Rate limit between companies; skip the sleep after the last one.
                if results['processed'] < results['total']:
                    time.sleep(delay)
            return results
        finally:
            db.close()

View File

@ -455,6 +455,25 @@
background: #dbeafe;
}
/* Discovery badges */
.disc-badge {
display: inline-block;
padding: 1px 6px;
border-radius: var(--radius);
font-size: var(--font-size-xs);
font-weight: 500;
}
.disc-badge.disc-match {
background: #dcfce7;
color: #166534;
}
.disc-badge.disc-miss {
background: #f3f4f6;
color: #9ca3af;
}
/* Responsive */
@media (max-width: 768px) {
.dq-bar-label { width: 100px; font-size: var(--font-size-xs); }
@ -602,6 +621,115 @@
</div>
{% endif %}
<!-- Website Discovery Section -->
<div class="dq-section">
<div style="display: flex; justify-content: space-between; align-items: center; margin-bottom: var(--spacing-lg);">
<div class="dq-section-title" style="margin-bottom: 0;">
Odkryte strony WWW
{% if discovery_data %}
<span style="font-size: var(--font-size-sm); color: var(--text-secondary); font-weight: 400;">({{ discovery_data|length }} kandydatów)</span>
{% endif %}
</div>
<div style="display: flex; gap: var(--spacing-sm); align-items: center;">
{% if companies_without_website > 0 %}
<span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ companies_without_website }} firm bez WWW</span>
{% endif %}
<button onclick="discoverWebsitesBulk()" class="dq-bulk-btn"
style="background: var(--primary); color: white; padding: var(--spacing-sm) var(--spacing-lg); border-radius: var(--radius);">
Szukaj WWW
</button>
</div>
</div>
{% if discovery_data %}
<table class="dq-table" id="discoveryTable">
<thead>
<tr>
<th style="width: 5px;"></th>
<th>Firma</th>
<th>Strona</th>
<th>Dopasowania</th>
<th style="width: 80px;">Akcja</th>
</tr>
</thead>
<tbody>
{% for d in discovery_data %}
<tr id="disc-row-{{ d.id }}" style="border-left: 3px solid {% if d.confidence == 'high' %}#22c55e{% elif d.confidence == 'medium' %}#f59e0b{% else %}#d1d5db{% endif %};">
<td></td>
<td>
<a href="{{ url_for('admin.admin_company_detail', company_id=d.company_id) }}" class="dq-company-link">{{ d.company_name }}</a>
</td>
<td>
<a href="{{ d.url }}" target="_blank" style="color: var(--primary); text-decoration: none; font-weight: 500;">{{ d.domain }}</a>
{% if d.title %}
<br><span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ d.title[:60] }}</span>
{% endif %}
</td>
<td>
<div style="display: flex; gap: 4px; flex-wrap: wrap;">
{% if d.has_nip %}
<span class="disc-badge {% if d.match_nip %}disc-match{% else %}disc-miss{% endif %}">NIP</span>
{% endif %}
{% if d.has_regon %}
<span class="disc-badge {% if d.match_regon %}disc-match{% else %}disc-miss{% endif %}">REGON</span>
{% endif %}
{% if d.has_krs %}
<span class="disc-badge {% if d.match_krs %}disc-match{% else %}disc-miss{% endif %}">KRS</span>
{% endif %}
{% if d.has_phone %}
<span class="disc-badge {% if d.match_phone %}disc-match{% else %}disc-miss{% endif %}">Tel</span>
{% endif %}
{% if d.has_email %}
<span class="disc-badge {% if d.match_email %}disc-match{% else %}disc-miss{% endif %}">Email</span>
{% endif %}
{% if d.has_city %}
<span class="disc-badge {% if d.match_city %}disc-match{% else %}disc-miss{% endif %}">Miasto</span>
{% endif %}
{% if d.has_owner %}
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
{% endif %}
</div>
</td>
<td>
<div style="display: flex; gap: 4px;">
<button class="dq-action-btn" style="background: #dcfce7; border-color: #86efac; color: #166534;" title="Zatwierdź" onclick="acceptDiscovery({{ d.id }}, 'disc-row-{{ d.id }}')">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3"><path d="M5 13l4 4L19 7"/></svg>
</button>
<button class="dq-action-btn" style="background: #fee2e2; border-color: #fca5a5; color: #991b1b;" title="Odrzuć" onclick="rejectDiscovery({{ d.id }}, 'disc-row-{{ d.id }}')">
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="3"><path d="M6 18L18 6M6 6l12 12"/></svg>
</button>
</div>
</td>
</tr>
{% endfor %}
</tbody>
</table>
{% else %}
<p style="color: var(--text-secondary); font-size: var(--font-size-sm);">Brak kandydatów. Kliknij "Szukaj WWW" aby uruchomić wyszukiwanie.</p>
{% endif %}
</div>
<!-- Bulk Discovery Modal -->
<div id="discoveryModal" style="display: none; position: fixed; top: 0; left: 0; right: 0; bottom: 0; background: rgba(0,0,0,0.5); z-index: 9999; align-items: center; justify-content: center;">
<div style="background: var(--surface); border-radius: var(--radius-xl); padding: var(--spacing-xl); max-width: 500px; width: 90%; max-height: 80vh; overflow-y: auto;">
<h3 style="margin-bottom: var(--spacing-lg);">Wyszukiwanie stron WWW</h3>
<p style="color: var(--text-secondary); margin-bottom: var(--spacing-lg);">
Szukam stron internetowych dla firm bez uzupełnionej strony WWW...
</p>
<div style="display: flex; justify-content: space-between; margin-bottom: var(--spacing-sm);">
<span style="font-weight: 600;">Postęp</span>
<span id="discProgressText">0/0</span>
</div>
<div style="height: 8px; background: var(--background); border-radius: 4px; overflow: hidden;">
<div id="discProgressBar" style="height: 100%; background: var(--primary); border-radius: 4px; transition: width 0.3s; width: 0%;"></div>
</div>
<div id="discProgressLog" style="margin-top: var(--spacing-md); max-height: 250px; overflow-y: auto; font-size: var(--font-size-xs); font-family: monospace; color: var(--text-secondary);"></div>
<div style="margin-top: var(--spacing-lg); text-align: right;">
<button id="discCloseBtn" onclick="closeDiscoveryModal()" style="display: none; padding: var(--spacing-sm) var(--spacing-lg); border: 1px solid var(--border); border-radius: var(--radius); background: var(--surface); cursor: pointer;">Zamknij</button>
</div>
</div>
</div>
<!-- Companies Table -->
<div class="dq-section">
<div class="dq-section-title">Firmy wg kompletności danych</div>
@ -1037,4 +1165,123 @@
if (btn && !btn.disabled) btn.click();
});
}
// --- Website Discovery ---
function discoverWebsite(companyId, btn) {
    // Kick off discovery for a single company; the button itself shows progress.
    if (btn.disabled) return;
    var savedHTML = btn.innerHTML;
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    btn.disabled = true;
    btn.innerHTML = '...';
    var restore = function(title) {
        btn.innerHTML = savedHTML;
        btn.disabled = false;
        btn.title = title;
    };
    fetch('/admin/discover-website/' + companyId, {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (data.success && data.status === 'found') {
            // Green check replaces the button content; URL shown in the tooltip.
            btn.innerHTML = '<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="#22c55e" stroke-width="3"><path d="M5 13l4 4L19 7"/></svg>';
            btn.title = 'Znaleziono: ' + (data.url || '');
        } else {
            restore(data.error || 'Brak wyników');
        }
    })
    .catch(function(err) {
        restore('Błąd: ' + err.message);
    });
}
function discoverWebsitesBulk() {
    // Reset and open the progress modal, then launch the server-side bulk job.
    var closeBtn = document.getElementById('discCloseBtn');
    var log = document.getElementById('discProgressLog');
    document.getElementById('discoveryModal').style.display = 'flex';
    closeBtn.style.display = 'none';
    document.getElementById('discProgressBar').style.width = '0%';
    document.getElementById('discProgressText').textContent = 'Uruchamiam...';
    log.innerHTML = '';
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discover-websites-bulk', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (data.job_id) {
            pollDiscoveryProgress(data.job_id);
        }
    })
    .catch(function(err) {
        log.innerHTML = '<div style="color: #ef4444;">Błąd: ' + err.message + '</div>';
        closeBtn.style.display = 'inline-block';
    });
}
function pollDiscoveryProgress(jobId) {
    // Poll the bulk job every 3s and update the modal's bar and log.
    // Fixes vs. original: (1) the same latest_result is no longer re-appended
    // on every poll (polling is faster than processing, which duplicated log
    // lines); (2) server-provided text is inserted via textContent so company
    // names containing HTML cannot inject markup into the page.
    fetch('/admin/discover-websites-status?job_id=' + jobId)
    .then(function(r) { return r.json(); })
    .then(function(data) {
        var total = data.total || 1;
        var processed = data.processed || 0;
        var pct = Math.round(processed / total * 100);
        document.getElementById('discProgressBar').style.width = pct + '%';
        document.getElementById('discProgressText').textContent = processed + '/' + total;
        var log = document.getElementById('discProgressLog');
        if (data.latest_result && data.latest_result !== pollDiscoveryProgress._lastMsg) {
            pollDiscoveryProgress._lastMsg = data.latest_result;
            var line = document.createElement('div');
            line.textContent = data.latest_result;
            log.appendChild(line);
            log.scrollTop = log.scrollHeight;
        }
        if (data.status === 'running') {
            setTimeout(function() { pollDiscoveryProgress(jobId); }, 3000);
        } else {
            var done = document.createElement('div');
            done.style.color = '#22c55e';
            done.style.fontWeight = '600';
            done.textContent = 'Zakończono! Odśwież stronę aby zobaczyć wyniki.';
            log.appendChild(done);
            log.scrollTop = log.scrollHeight;
            document.getElementById('discCloseBtn').style.display = 'inline-block';
        }
    });
}
function closeDiscoveryModal() {
    // Hide the bulk-discovery progress modal.
    var modal = document.getElementById('discoveryModal');
    modal.style.display = 'none';
}
function acceptDiscovery(candidateId, rowId) {
    // Approve a candidate: the server copies the URL onto the company record.
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discovery/' + candidateId + '/accept', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (!data.success) {
            alert('Błąd: ' + (data.error || 'nieznany'));
            return;
        }
        // Fade the row to indicate the candidate was handled.
        var row = document.getElementById(rowId);
        if (row) row.style.opacity = '0.3';
    })
    .catch(function(err) { alert('Błąd: ' + err.message); });
}
function rejectDiscovery(candidateId, rowId) {
    // Reject a candidate; on success the row disappears from the table.
    var csrfToken = document.querySelector('meta[name=csrf-token]')?.content || '';
    fetch('/admin/discovery/' + candidateId + '/reject', {
        method: 'POST',
        headers: {'Content-Type': 'application/json', 'X-CSRFToken': csrfToken}
    })
    .then(function(resp) { return resp.json(); })
    .then(function(data) {
        if (!data.success) {
            alert('Błąd: ' + (data.error || 'nieznany'));
            return;
        }
        var row = document.getElementById(rowId);
        if (row) row.remove();
    })
    .catch(function(err) { alert('Błąd: ' + err.message); });
}
{% endblock %}