feat: multi-candidate scoring and domain name matching for website discovery
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Evaluate top 3 Brave results instead of just taking the first one.
Add domain name matching signal (+2 pts when domain contains company name).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 08:52:43 +01:00
parent b1737defa9
commit 2e0c19d427
3 changed files with 120 additions and 57 deletions

View File

@ -21,6 +21,7 @@ from database import (
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
from services.website_discovery_service import WebsiteDiscoveryService
logger = logging.getLogger(__name__)
@ -251,6 +252,9 @@ def admin_data_quality():
'has_email': bool(comp.email),
'has_city': bool(comp.address_city),
'has_owner': bool(getattr(comp, 'owner_name', None)),
'match_domain': WebsiteDiscoveryService()._domain_matches_company(
dc.candidate_domain or '', comp.name
),
})
# Count companies without website

View File

@ -34,6 +34,9 @@ DIRECTORY_DOMAINS = {
'analizy.pl', 'transfermarkt.pl', 'mojewejherowo.pl', 'orlyjubilerstwa.pl',
'norda-biznes.info', 'bizraport.pl', 'aplikuj.pl', 'lexspace.pl',
'drewnianeabc.pl', 'f-trust.pl', 'itspace.llc',
'biznesfinder.pl', 'egospodarka.pl', 'bazatel.pl',
'wspanialewesele.com.pl', 'wyszukiwarkakrs.pl', 'funduszowe.pl',
'itspace.company',
# Social media
'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
'twitter.com', 'x.com', 'tiktok.com',
@ -217,7 +220,8 @@ class WebsiteDiscoveryService:
def discover_for_company(self, company):
"""
Search for website, scrape, compare, save candidate.
Search for website, evaluate top candidates, save the best one.
Scrapes up to 3 results, scores each, picks highest score.
Returns dict with result info.
"""
if not self.brave_api_key:
@ -241,66 +245,93 @@ class WebsiteDiscoveryService:
if not urls:
return {'error': 'Brak wyników', 'company_id': company.id}
# Take best candidate (first non-directory URL)
best = urls[0]
url = best['url']
domain = urlparse(url).netloc.lower()
if domain.startswith('www.'):
domain = domain[4:]
# Evaluate top 3 candidates, pick the best
best_candidate = None
best_score = -1
# Check for existing candidate
existing = db.query(WebsiteDiscoveryCandidate).filter_by(
company_id=company.id, candidate_url=url
).first()
if existing:
return {'status': 'exists', 'candidate_id': existing.id}
for brave_result in urls[:3]:
url = brave_result['url']
domain = urlparse(url).netloc.lower()
if domain.startswith('www.'):
domain = domain[4:]
# Fetch and extract
page_text = _fetch_page_text(url)
# Check for existing candidate with this URL
existing = db.query(WebsiteDiscoveryCandidate).filter_by(
company_id=company.id, candidate_url=url
).first()
if existing:
continue
extracted = {}
if page_text:
extracted = {
'nips': _find_nips_in_text(page_text),
'regons': _find_regons_in_text(page_text),
'krs': _find_krs_in_text(page_text),
'emails': _extract_emails(page_text),
'phones': _extract_phones(page_text),
'text_snippet': page_text[:500],
}
else:
extracted = {
'nips': [], 'regons': [], 'krs': [],
'emails': [], 'phones': [], 'text_snippet': '',
# Fetch and extract
page_text = _fetch_page_text(url)
if page_text:
extracted = {
'nips': _find_nips_in_text(page_text),
'regons': _find_regons_in_text(page_text),
'krs': _find_krs_in_text(page_text),
'emails': _extract_emails(page_text),
'phones': _extract_phones(page_text),
'text_snippet': page_text[:500],
}
else:
extracted = {
'nips': [], 'regons': [], 'krs': [],
'emails': [], 'phones': [], 'text_snippet': '',
}
# Compute match signals
signals = self._compute_signals(extracted, company, page_text)
# Domain name matching bonus
domain_match = self._domain_matches_company(domain, company.name)
signals['domain'] = domain_match
confidence, score = self._compute_confidence(signals)
candidate_data = {
'url': url,
'domain': domain,
'brave_result': brave_result,
'extracted': extracted,
'signals': signals,
'confidence': confidence,
'score': score,
'page_text': page_text,
}
# Compute match signals
signals = self._compute_signals(extracted, company, page_text)
confidence, score = self._compute_confidence(signals)
if score > best_score:
best_score = score
best_candidate = candidate_data
# Save candidate
if not best_candidate:
# All URLs already exist as candidates
return {'status': 'exists', 'company_id': company.id}
# Save best candidate
c = best_candidate
candidate = WebsiteDiscoveryCandidate(
company_id=company.id,
search_query=query,
candidate_url=url,
candidate_domain=domain,
brave_title=best.get('title', ''),
brave_description=best.get('description', ''),
extracted_nips=extracted['nips'] or None,
extracted_regons=extracted['regons'] or None,
extracted_krs=extracted['krs'] or None,
extracted_phones=extracted['phones'] or None,
extracted_emails=extracted['emails'] or None,
page_text_snippet=extracted['text_snippet'] or None,
match_nip=signals.get('nip', False),
match_regon=signals.get('regon', False),
match_krs=signals.get('krs', False),
match_phone=signals.get('phone', False),
match_email=signals.get('email', False),
match_city=signals.get('city', False),
match_owner=signals.get('owner', False),
confidence=confidence,
match_score=score,
candidate_url=c['url'],
candidate_domain=c['domain'],
brave_title=c['brave_result'].get('title', ''),
brave_description=c['brave_result'].get('description', ''),
extracted_nips=c['extracted']['nips'] or None,
extracted_regons=c['extracted']['regons'] or None,
extracted_krs=c['extracted']['krs'] or None,
extracted_phones=c['extracted']['phones'] or None,
extracted_emails=c['extracted']['emails'] or None,
page_text_snippet=c['extracted']['text_snippet'] or None,
match_nip=c['signals'].get('nip', False),
match_regon=c['signals'].get('regon', False),
match_krs=c['signals'].get('krs', False),
match_phone=c['signals'].get('phone', False),
match_email=c['signals'].get('email', False),
match_city=c['signals'].get('city', False),
match_owner=c['signals'].get('owner', False),
confidence=c['confidence'],
match_score=c['score'],
)
db.add(candidate)
db.commit()
@ -308,10 +339,10 @@ class WebsiteDiscoveryService:
return {
'status': 'found',
'candidate_id': candidate.id,
'url': url,
'confidence': confidence,
'score': score,
'signals': signals,
'url': c['url'],
'confidence': c['confidence'],
'score': c['score'],
'signals': c['signals'],
}
except Exception as e:
db.rollback()
@ -430,14 +461,41 @@ class WebsiteDiscoveryService:
return signals
def _domain_matches_company(self, domain, company_name):
"""Check if domain name contains normalized company name."""
if not domain or not company_name:
return False
# Normalize: lowercase, remove common suffixes, special chars
name = company_name.lower()
# Remove legal forms
for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
' sp. k.', ' sp.p.', ' sp. z o. o.']:
name = name.replace(suffix, '')
# Remove special chars, keep only letters and digits
name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
# Polish char mapping for domain comparison
pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
name_ascii = ''.join(pl_map.get(c, c) for c in name)
# Get domain without TLD
domain_base = domain.split('.')[0].lower()
domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
# Match if domain base contains the company name (or vice versa for short names)
if len(name_ascii) >= 3 and (name_ascii in domain_base or domain_base in name_ascii):
return True
return False
def _compute_confidence(self, signals):
"""Compute confidence level and numeric score."""
weights = {
'nip': 3, 'regon': 3, 'krs': 3,
'phone': 2, 'email': 2,
'city': 1, 'owner': 1,
'domain': 2,
}
score = sum(weights[k] for k, v in signals.items() if v)
score = sum(weights.get(k, 0) for k, v in signals.items() if v)
if score >= 5:
return 'high', score

View File

@ -713,6 +713,7 @@
{% if d.has_owner %}
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
{% endif %}
<span class="disc-badge {% if d.match_domain %}disc-match{% else %}disc-miss{% endif %}">Domena</span>
</div>
</td>
<td>