From ae9a658b0c6aff2944ad985aba45256107d38729 Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Sat, 21 Feb 2026 07:37:05 +0100 Subject: [PATCH] fix: strengthen Google Places name validation to prevent cross-company data contamination - Replace substring matching with word-boundary tokenized matching - Short names (1-2 words): require ALL significant words to match - Longer names (3+): require at least 50% word overlap - Pick best-scoring result instead of first match - Add company_name validation to competitor_monitoring_service - Show Google profile name in dashboard hints for admin verification - Display mismatch warning when Google name differs from company name Prevents cases like "IT Space" matching "Body Space" (score 0.50 < 1.00 threshold). Co-Authored-By: Claude Opus 4.6 --- blueprints/admin/routes_companies.py | 9 ++- blueprints/admin/routes_data_quality.py | 11 +++- competitor_monitoring_service.py | 3 +- google_places_service.py | 61 ++++++++++++++++++--- templates/admin/company_detail.html | 5 ++ templates/admin/data_quality_dashboard.html | 7 ++- 6 files changed, 81 insertions(+), 15 deletions(-) diff --git a/blueprints/admin/routes_companies.py b/blueprints/admin/routes_companies.py index e1d716f..de20922 100644 --- a/blueprints/admin/routes_companies.py +++ b/blueprints/admin/routes_companies.py @@ -741,10 +741,13 @@ def admin_company_detail(company_id): # --- Hints: where to find missing data --- hints = {} analysis = seo_analysis # CompanyWebsiteAnalysis object or None + # Google name for mismatch warning (e.g. "IT Space" vs "Body Space") + google_name = (analysis.google_name or '') if analysis else '' + google_name_mismatch = bool(google_name and google_name.lower() != company.name.lower()) if not company.phone: if analysis and analysis.google_phone: - hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply'} + hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''} elif analysis and analysis.nap_on_website: nap = analysis.nap_on_website if isinstance(analysis.nap_on_website, dict) else {} if nap.get('phone'): @@ -760,11 +763,11 @@ def admin_company_detail(company_id): if not company.website: if analysis and analysis.google_website: - hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply'} + hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''} if not company.address_city: if analysis and analysis.google_address: - hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply'} + hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''} if not company.description_short: if analysis and analysis.content_summary: diff --git a/blueprints/admin/routes_data_quality.py b/blueprints/admin/routes_data_quality.py index 72bd04b..1065348 100644 --- a/blueprints/admin/routes_data_quality.py +++ b/blueprints/admin/routes_data_quality.py @@ -183,6 +183,7 @@ def admin_data_quality(): avg_score = round(score_sum / total) if total > 0 else 0 # Available data: companies where Google has data but company profile is empty + # Include google_name so admin can verify the match is correct available_data = [] analyses = db.query(CompanyWebsiteAnalysis).all() company_map = {c.id: c for c in companies} @@ -191,20 +192,24 @@ def admin_data_quality(): comp = company_map.get(a.company_id) if not comp: continue + g_name = a.google_name or '' if a.google_phone and not comp.phone: available_data.append({ 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, - 'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone + 'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone, + 'google_name': g_name, }) if a.google_website and not comp.website: available_data.append({ 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, - 'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website + 'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website, + 'google_name': g_name, }) if a.google_address and not comp.address_city: available_data.append({ 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, - 'field': 'Adres', 'source': 'Google Business', 'value': a.google_address + 'field': 'Adres', 'source': 'Google Business', 'value': a.google_address, + 'google_name': g_name, }) return render_template( diff --git a/competitor_monitoring_service.py b/competitor_monitoring_service.py index 0c27915..cf4708d 100644 --- a/competitor_monitoring_service.py +++ b/competitor_monitoring_service.py @@ -72,7 +72,8 @@ class CompetitorMonitoringService: logger.warning(f"Company {company_id} has no Google Place ID, searching by name") # Search for the company first place = self.places_service.search_place( - f"{company.name} {company.address_city or 'Wejherowo'}" + f"{company.name} {company.address_city or 'Wejherowo'}", + company_name=company.name ) if not place: return [] diff --git a/google_places_service.py b/google_places_service.py index dbe226a..45803ee 100644 --- a/google_places_service.py +++ b/google_places_service.py @@ -136,6 +136,38 @@ class GooglePlacesService: logger.error(f"Places API request error for {place_id}: {e}") return None + @staticmethod + def _tokenize_name(name: str) -> set: + """Tokenize a company name into significant lowercase words.""" + import re as _re + skip_words = { + 'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w', + 'do', 'na', 'po', 'ze', 'the', 'and', 'of', 'for', 'group', + } + # Split on non-alphanumeric, keep words + words = _re.findall(r'[a-ząćęłńóśźż0-9]+', name.lower()) + return {w for w in words if len(w) > 1 and w not in skip_words} + + @staticmethod + def _name_match_score(company_name: str, google_name: str) -> float: + """ + Compute name match score between company name and Google result name. + + Returns float 0.0-1.0: + - 1.0 = all significant company words found in Google name + - 0.0 = no words matched + Uses word-boundary matching (not substring) to prevent + 'IT' matching 'digital' or 'Space' matching 'Body Space' alone. + """ + company_words = GooglePlacesService._tokenize_name(company_name) + google_words = GooglePlacesService._tokenize_name(google_name) + + if not company_words: + return 0.0 + + matched = company_words & google_words + return len(matched) / len(company_words) + def search_place(self, query: str, location_bias: Dict = None, company_name: str = None) -> Optional[Dict[str, Any]]: """ @@ -185,17 +217,32 @@ class GooglePlacesService: if not company_name: return places[0] - # Validate: at least one significant word from company name must appear in result name - skip_words = {'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w', 'do', 'na', 'po', 'ze'} - name_words = {w.lower() for w in company_name.split() if len(w) > 1 and w.lower() not in skip_words} + # Validate: company name must significantly match Google result name. + # Uses word-boundary matching with minimum threshold: + # - Short names (1-2 significant words): ALL words must match + # - Longer names (3+ words): at least 50% of words must match + company_words = self._tokenize_name(company_name) + min_ratio = 1.0 if len(company_words) <= 2 else 0.5 + + best_place = None + best_score = 0.0 for place in places: - google_name = place.get('displayName', {}).get('text', '').lower() - if any(word in google_name for word in name_words): - return place + google_name = place.get('displayName', {}).get('text', '') + score = self._name_match_score(company_name, google_name) + if score >= min_ratio and score > best_score: + best_score = score + best_place = place + + if best_place: + matched_name = best_place.get('displayName', {}).get('text', '') + logger.info( + f"Name match for '{company_name}': '{matched_name}' (score={best_score:.2f})" + ) + return best_place logger.warning( - f"No name match for '{company_name}' in Google results: " + f"No name match for '{company_name}' (min_ratio={min_ratio:.0%}) in Google results: " f"{[p.get('displayName', {}).get('text', '') for p in places]}" ) return None diff --git a/templates/admin/company_detail.html b/templates/admin/company_detail.html index 3396b3f..75c88bb 100644 --- a/templates/admin/company_detail.html +++ b/templates/admin/company_detail.html @@ -841,6 +841,11 @@ {{ hints[field_name].source }}{% if hints[field_name].value %}: {{ hints[field_name].value[:40] }}{% endif %} + {% if hints[field_name].get('google_name') %} + + (profil: {{ hints[field_name].google_name[:30] }}) + + {% endif %} {% if hints[field_name].action == 'apply' and hints[field_name].value %} diff --git a/templates/admin/data_quality_dashboard.html b/templates/admin/data_quality_dashboard.html index 202a44f..fd38f46 100644 --- a/templates/admin/data_quality_dashboard.html +++ b/templates/admin/data_quality_dashboard.html @@ -582,7 +582,12 @@ {{ item.company_name }} {{ item.field }} - {{ item.source }} + + {{ item.source }} + {% if item.google_name and item.google_name.lower() != item.company_name.lower() %} +
Profil: {{ item.google_name[:40] }} + {% endif %} + {{ item.value[:50] }}