diff --git a/blueprints/admin/routes_data_quality.py b/blueprints/admin/routes_data_quality.py
index 74d8001..7b343ac 100644
--- a/blueprints/admin/routes_data_quality.py
+++ b/blueprints/admin/routes_data_quality.py
@@ -21,6 +21,7 @@ from database import (
 )
 from utils.decorators import role_required
 from utils.data_quality import compute_weighted_score
+from services.website_discovery_service import WebsiteDiscoveryService
 
 logger = logging.getLogger(__name__)
 
@@ -251,6 +252,9 @@ def admin_data_quality():
                     'has_email': bool(comp.email),
                     'has_city': bool(comp.address_city),
                     'has_owner': bool(getattr(comp, 'owner_name', None)),
+                    'match_domain': WebsiteDiscoveryService._domain_matches_company(
+                        dc.candidate_domain or '', comp.name
+                    ),
                 })
 
         # Count companies without website
diff --git a/services/website_discovery_service.py b/services/website_discovery_service.py
index b8d5567..fca467c 100644
--- a/services/website_discovery_service.py
+++ b/services/website_discovery_service.py
@@ -34,6 +34,9 @@ DIRECTORY_DOMAINS = {
     'analizy.pl', 'transfermarkt.pl', 'mojewejherowo.pl', 'orlyjubilerstwa.pl',
     'norda-biznes.info', 'bizraport.pl', 'aplikuj.pl', 'lexspace.pl',
     'drewnianeabc.pl', 'f-trust.pl', 'itspace.llc',
+    'biznesfinder.pl', 'egospodarka.pl', 'bazatel.pl',
+    'wspanialewesele.com.pl', 'wyszukiwarkakrs.pl', 'funduszowe.pl',
+    'itspace.company',
     # Social media
     'facebook.com', 'linkedin.com', 'youtube.com', 'instagram.com',
     'twitter.com', 'x.com', 'tiktok.com',
@@ -217,7 +220,8 @@ class WebsiteDiscoveryService:
 
     def discover_for_company(self, company):
         """
-        Search for website, scrape, compare, save candidate.
+        Search for website, evaluate top candidates, save the best one.
+        Scrapes up to 3 results, scores each, picks highest score.
         Returns dict with result info.
         """
         if not self.brave_api_key:
@@ -241,66 +245,93 @@ class WebsiteDiscoveryService:
             if not urls:
                 return {'error': 'Brak wyników', 'company_id': company.id}
 
-            # Take best candidate (first non-directory URL)
-            best = urls[0]
-            url = best['url']
-            domain = urlparse(url).netloc.lower()
-            if domain.startswith('www.'):
-                domain = domain[4:]
+            # Evaluate top 3 candidates, pick the best
+            best_candidate = None
+            best_score = -1
 
-            # Check for existing candidate
-            existing = db.query(WebsiteDiscoveryCandidate).filter_by(
-                company_id=company.id, candidate_url=url
-            ).first()
-            if existing:
-                return {'status': 'exists', 'candidate_id': existing.id}
+            for brave_result in urls[:3]:
+                url = brave_result['url']
+                domain = urlparse(url).netloc.lower()
+                if domain.startswith('www.'):
+                    domain = domain[4:]
 
-            # Fetch and extract
-            page_text = _fetch_page_text(url)
+                # Check for existing candidate with this URL
+                existing = db.query(WebsiteDiscoveryCandidate).filter_by(
+                    company_id=company.id, candidate_url=url
+                ).first()
+                if existing:
+                    continue
 
-            extracted = {}
-            if page_text:
-                extracted = {
-                    'nips': _find_nips_in_text(page_text),
-                    'regons': _find_regons_in_text(page_text),
-                    'krs': _find_krs_in_text(page_text),
-                    'emails': _extract_emails(page_text),
-                    'phones': _extract_phones(page_text),
-                    'text_snippet': page_text[:500],
-                }
-            else:
-                extracted = {
-                    'nips': [], 'regons': [], 'krs': [],
-                    'emails': [], 'phones': [], 'text_snippet': '',
+                # Fetch and extract
+                page_text = _fetch_page_text(url)
+
+                if page_text:
+                    extracted = {
+                        'nips': _find_nips_in_text(page_text),
+                        'regons': _find_regons_in_text(page_text),
+                        'krs': _find_krs_in_text(page_text),
+                        'emails': _extract_emails(page_text),
+                        'phones': _extract_phones(page_text),
+                        'text_snippet': page_text[:500],
+                    }
+                else:
+                    extracted = {
+                        'nips': [], 'regons': [], 'krs': [],
+                        'emails': [], 'phones': [], 'text_snippet': '',
+                    }
+
+                # Compute match signals
+                signals = self._compute_signals(extracted, company, page_text)
+
+                # Domain name matching bonus
+                domain_match = self._domain_matches_company(domain, company.name)
+                signals['domain'] = domain_match
+
+                confidence, score = self._compute_confidence(signals)
+
+                candidate_data = {
+                    'url': url,
+                    'domain': domain,
+                    'brave_result': brave_result,
+                    'extracted': extracted,
+                    'signals': signals,
+                    'confidence': confidence,
+                    'score': score,
+                    'page_text': page_text,
                 }
 
-            # Compute match signals
-            signals = self._compute_signals(extracted, company, page_text)
-            confidence, score = self._compute_confidence(signals)
+                if score > best_score:
+                    best_score = score
+                    best_candidate = candidate_data
 
-            # Save candidate
+            if not best_candidate:
+                # All URLs already exist as candidates
+                return {'status': 'exists', 'company_id': company.id}
+
+            # Save best candidate
+            c = best_candidate
             candidate = WebsiteDiscoveryCandidate(
                 company_id=company.id,
                 search_query=query,
-                candidate_url=url,
-                candidate_domain=domain,
-                brave_title=best.get('title', ''),
-                brave_description=best.get('description', ''),
-                extracted_nips=extracted['nips'] or None,
-                extracted_regons=extracted['regons'] or None,
-                extracted_krs=extracted['krs'] or None,
-                extracted_phones=extracted['phones'] or None,
-                extracted_emails=extracted['emails'] or None,
-                page_text_snippet=extracted['text_snippet'] or None,
-                match_nip=signals.get('nip', False),
-                match_regon=signals.get('regon', False),
-                match_krs=signals.get('krs', False),
-                match_phone=signals.get('phone', False),
-                match_email=signals.get('email', False),
-                match_city=signals.get('city', False),
-                match_owner=signals.get('owner', False),
-                confidence=confidence,
-                match_score=score,
+                candidate_url=c['url'],
+                candidate_domain=c['domain'],
+                brave_title=c['brave_result'].get('title', ''),
+                brave_description=c['brave_result'].get('description', ''),
+                extracted_nips=c['extracted']['nips'] or None,
+                extracted_regons=c['extracted']['regons'] or None,
+                extracted_krs=c['extracted']['krs'] or None,
+                extracted_phones=c['extracted']['phones'] or None,
+                extracted_emails=c['extracted']['emails'] or None,
+                page_text_snippet=c['extracted']['text_snippet'] or None,
+                match_nip=c['signals'].get('nip', False),
+                match_regon=c['signals'].get('regon', False),
+                match_krs=c['signals'].get('krs', False),
+                match_phone=c['signals'].get('phone', False),
+                match_email=c['signals'].get('email', False),
+                match_city=c['signals'].get('city', False),
+                match_owner=c['signals'].get('owner', False),
+                confidence=c['confidence'],
+                match_score=c['score'],
             )
             db.add(candidate)
             db.commit()
@@ -308,10 +339,10 @@ class WebsiteDiscoveryService:
             return {
                 'status': 'found',
                 'candidate_id': candidate.id,
-                'url': url,
-                'confidence': confidence,
-                'score': score,
-                'signals': signals,
+                'url': c['url'],
+                'confidence': c['confidence'],
+                'score': c['score'],
+                'signals': c['signals'],
             }
         except Exception as e:
             db.rollback()
@@ -430,14 +461,50 @@ class WebsiteDiscoveryService:
 
         return signals
 
+    @staticmethod
+    def _domain_matches_company(domain, company_name):
+        """Check whether a domain's first label and a company name contain each other.
+
+        The company name is lowercased, has the listed legal-form markers and
+        all non-alphanumeric characters removed, and is transliterated to
+        ASCII; the domain is reduced to the [a-z0-9] characters of its first
+        dot-separated label. Returns True only when both normalized strings
+        have at least 3 characters and one contains the other.
+        """
+        if not domain or not company_name:
+            return False
+        # Normalize: lowercase, remove common suffixes, special chars
+        name = company_name.lower()
+        # Remove legal forms
+        for suffix in [' sp. z o.o.', ' sp.z o.o.', ' s.a.', ' s.c.', ' sp.j.',
+                       ' sp. k.', ' sp.p.', ' sp. z o. o.']:
+            name = name.replace(suffix, '')
+        # Remove special chars, keep only letters and digits
+        name = re.sub(r'[^a-z0-9ąćęłńóśźż]', '', name)
+        # Polish char mapping for domain comparison
+        pl_map = {'ą': 'a', 'ć': 'c', 'ę': 'e', 'ł': 'l', 'ń': 'n',
+                  'ó': 'o', 'ś': 's', 'ź': 'z', 'ż': 'z'}
+        name_ascii = ''.join(pl_map.get(c, c) for c in name)
+
+        # First dot-separated label only (not a full TLD/subdomain parse)
+        domain_base = domain.split('.')[0].lower()
+        domain_base = re.sub(r'[^a-z0-9]', '', domain_base)
+
+        # Both sides need >= 3 chars: '' is a substring of every string, so an
+        # empty or very short label would otherwise match almost any company.
+        if len(name_ascii) < 3 or len(domain_base) < 3:
+            return False
+        return name_ascii in domain_base or domain_base in name_ascii
+
     def _compute_confidence(self, signals):
         """Compute confidence level and numeric score."""
         weights = {
             'nip': 3, 'regon': 3, 'krs': 3,
             'phone': 2, 'email': 2,
             'city': 1, 'owner': 1,
+            'domain': 2,
         }
-        score = sum(weights[k] for k, v in signals.items() if v)
+        score = sum(weights.get(k, 0) for k, v in signals.items() if v)
 
         if score >= 5:
             return 'high', score
diff --git a/templates/admin/data_quality_dashboard.html b/templates/admin/data_quality_dashboard.html
index 56eaf69..bb9b402 100644
--- a/templates/admin/data_quality_dashboard.html
+++ b/templates/admin/data_quality_dashboard.html
@@ -713,6 +713,7 @@
 {% if d.has_owner %}
 Właściciel
 {% endif %}
+Domena