feat: multi-candidate scoring and domain name matching for website discovery

Evaluate the top 3 Brave results instead of just taking the first one, and save the highest-scoring candidate. Add a domain name matching signal (+2 pts when the domain's first label contains the normalized company name, or vice versa).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 08:52:43 +01:00 · 2026-02-21 08:52:43 +01:00 · 2e0c19d427
commit 2e0c19d427
parent b1737defa9
3 changed files with 120 additions and 57 deletions
--- a/blueprints/admin/routes_data_quality.py
+++ b/blueprints/admin/routes_data_quality.py
@ -21,6 +21,7 @@ from database import (
 )
 from utils.decorators import role_required
 from utils.data_quality import compute_weighted_score
+from services.website_discovery_service import WebsiteDiscoveryService

 logger = logging.getLogger(__name__)

@ -251,6 +252,9 @@ def admin_data_quality():
                'has_email': bool(comp.email),
                'has_city': bool(comp.address_city),
                'has_owner': bool(getattr(comp, 'owner_name', None)),
+                'match_domain': WebsiteDiscoveryService()._domain_matches_company(
+                    dc.candidate_domain or '', comp.name
+                ),
            })

        # Count companies without website
--- a/services/website_discovery_service.py
+++ b/services/website_discovery_service.py
@ -34,6 +34,9 @@ DIRECTORY_DOMAINS = {
    'analizy.pl', 'transfermarkt.pl', 'mojewejherowo.pl', 'orlyjubilerstwa.pl',
    'norda-biznes.info', 'bizraport.pl', 'aplikuj.pl', 'lexspace.pl',
    'drewnianeabc.pl', 'f-trust.pl', 'itspace.llc',
+    'biznesfinder.pl', 'egospodarka.pl', 'bazatel.pl',
+    'wspanialewesele.com.pl', 'wyszukiwarkakrs.pl', 'funduszowe.pl',
+    'itspace.company',
    # Social media
    'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
    'twitter.com', 'x.com', 'tiktok.com',
@ -217,7 +220,8 @@ class WebsiteDiscoveryService:

    def discover_for_company(self, company):
        """
-        Search for website, scrape, compare, save candidate.
+        Search for website, evaluate top candidates, save the best one.
+        Scrapes up to 3 results, scores each, picks highest score.
        Returns dict with result info.
        """
        if not self.brave_api_key:
@ -241,66 +245,93 @@ class WebsiteDiscoveryService:
            if not urls:
                return {'error': 'Brak wyników', 'company_id': company.id}

-            # Take best candidate (first non-directory URL)
-            best = urls[0]
-            url = best['url']
-            domain = urlparse(url).netloc.lower()
-            if domain.startswith('www.'):
-                domain = domain[4:]
+            # Evaluate top 3 candidates, pick the best
+            best_candidate = None
+            best_score = -1

-            # Check for existing candidate
-            existing = db.query(WebsiteDiscoveryCandidate).filter_by(
-                company_id=company.id, candidate_url=url
-            ).first()
-            if existing:
-                return {'status': 'exists', 'candidate_id': existing.id}
+            for brave_result in urls[:3]:
+                url = brave_result['url']
+                domain = urlparse(url).netloc.lower()
+                if domain.startswith('www.'):
+                    domain = domain[4:]

-            # Fetch and extract
-            page_text = _fetch_page_text(url)
+                # Check for existing candidate with this URL
+                existing = db.query(WebsiteDiscoveryCandidate).filter_by(
+                    company_id=company.id, candidate_url=url
+                ).first()
+                if existing:
+                    continue

-            extracted = {}
-            if page_text:
-                extracted = {
-                    'nips': _find_nips_in_text(page_text),
-                    'regons': _find_regons_in_text(page_text),
-                    'krs': _find_krs_in_text(page_text),
-                    'emails': _extract_emails(page_text),
-                    'phones': _extract_phones(page_text),
-                    'text_snippet': page_text[:500],
-                }
-            else:
-                extracted = {
-                    'nips': [], 'regons': [], 'krs': [],
-                    'emails': [], 'phones': [], 'text_snippet': '',
+                # Fetch and extract
+                page_text = _fetch_page_text(url)
+
+                if page_text:
+                    extracted = {
+                        'nips': _find_nips_in_text(page_text),
+                        'regons': _find_regons_in_text(page_text),
+                        'krs': _find_krs_in_text(page_text),
+                        'emails': _extract_emails(page_text),
+                        'phones': _extract_phones(page_text),
+                        'text_snippet': page_text[:500],
+                    }
+                else:
+                    extracted = {
+                        'nips': [], 'regons': [], 'krs': [],
+                        'emails': [], 'phones': [], 'text_snippet': '',
+                    }
+
+                # Compute match signals
+                signals = self._compute_signals(extracted, company, page_text)
+
+                # Domain name matching bonus
+                domain_match = self._domain_matches_company(domain, company.name)
+                signals['domain'] = domain_match
+
+                confidence, score = self._compute_confidence(signals)
+
+                candidate_data = {
+                    'url': url,
+                    'domain': domain,
+                    'brave_result': brave_result,
+                    'extracted': extracted,
+                    'signals': signals,
+                    'confidence': confidence,
+                    'score': score,
+                    'page_text': page_text,
                }

-            # Compute match signals
-            signals = self._compute_signals(extracted, company, page_text)
-            confidence, score = self._compute_confidence(signals)
+                if score > best_score:
+                    best_score = score
+                    best_candidate = candidate_data

-            # Save candidate
+            if not best_candidate:
+                # All URLs already exist as candidates
+                return {'status': 'exists', 'company_id': company.id}
+
+            # Save best candidate
+            c = best_candidate
            candidate = WebsiteDiscoveryCandidate(
                company_id=company.id,
                search_query=query,
-                candidate_url=url,
-                candidate_domain=domain,
-                brave_title=best.get('title', ''),
-                brave_description=best.get('description', ''),
-                extracted_nips=extracted['nips'] or None,
-                extracted_regons=extracted['regons'] or None,
-                extracted_krs=extracted['krs'] or None,
-                extracted_phones=extracted['phones'] or None,
-                extracted_emails=extracted['emails'] or None,
-                page_text_snippet=extracted['text_snippet'] or None,
-                match_nip=signals.get('nip', False),
-                match_regon=signals.get('regon', False),
-                match_krs=signals.get('krs', False),
-                match_phone=signals.get('phone', False),
-                match_email=signals.get('email', False),
-                match_city=signals.get('city', False),
-                match_owner=signals.get('owner', False),
-                confidence=confidence,
-                match_score=score,
+                candidate_url=c['url'],
+                candidate_domain=c['domain'],
+                brave_title=c['brave_result'].get('title', ''),
+                brave_description=c['brave_result'].get('description', ''),
+                extracted_nips=c['extracted']['nips'] or None,
+                extracted_regons=c['extracted']['regons'] or None,
+                extracted_krs=c['extracted']['krs'] or None,
+                extracted_phones=c['extracted']['phones'] or None,
+                extracted_emails=c['extracted']['emails'] or None,
+                page_text_snippet=c['extracted']['text_snippet'] or None,
+                match_nip=c['signals'].get('nip', False),
+                match_regon=c['signals'].get('regon', False),
+                match_krs=c['signals'].get('krs', False),
+                match_phone=c['signals'].get('phone', False),
+                match_email=c['signals'].get('email', False),
+                match_city=c['signals'].get('city', False),
+                match_owner=c['signals'].get('owner', False),
+                confidence=c['confidence'],
+                match_score=c['score'],
            )
            db.add(candidate)
            db.commit()
@ -308,10 +339,10 @@ class WebsiteDiscoveryService:
            return {
                'status': 'found',
                'candidate_id': candidate.id,
-                'url': url,
-                'confidence': confidence,
-                'score': score,
-                'signals': signals,
+                'url': c['url'],
+                'confidence': c['confidence'],
+                'score': c['score'],
+                'signals': c['signals'],
            }
        except Exception as e:
            db.rollback()
@ -430,14 +461,41 @@ class WebsiteDiscoveryService:

        return signals

+    def _domain_matches_company(self, domain, company_name):
+        """Check if domain name contains normalized company name."""
+        if not domain or not company_name:
+            return False
+        # Normalize: lowercase, remove common suffixes, special chars
+        name = company_name.lower()
+        # Remove legal forms
+        for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
+                       ' sp. k.', ' sp.p.', ' sp. z o. o.']:
+            name = name.replace(suffix, '')
+        # Remove special chars, keep only letters and digits
+        name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
+        # Polish char mapping for domain comparison
+        pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
+                  'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
+        name_ascii = ''.join(pl_map.get(c, c) for c in name)
+
+        # Get domain without TLD
+        domain_base = domain.split('.')[0].lower()
+        domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
+
+        # Match if domain base contains the company name (or vice versa for short names)
+        if len(name_ascii) >= 3 and (name_ascii in domain_base or domain_base in name_ascii):
+            return True
+        return False
+
    def _compute_confidence(self, signals):
        """Compute confidence level and numeric score."""
        weights = {
            'nip': 3, 'regon': 3, 'krs': 3,
            'phone': 2, 'email': 2,
            'city': 1, 'owner': 1,
+            'domain': 2,
        }
-        score = sum(weights[k] for k, v in signals.items() if v)
+        score = sum(weights.get(k, 0) for k, v in signals.items() if v)

        if score >= 5:
            return 'high', score
--- a/templates/admin/data_quality_dashboard.html
+++ b/templates/admin/data_quality_dashboard.html
@ -713,6 +713,7 @@
                        {% if d.has_owner %}
                        <span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
                        {% endif %}
+                        <span class="disc-badge {% if d.match_domain %}disc-match{% else %}disc-miss{% endif %}">Domena</span>
                    </div>
                </td>
                <td>