fix: strengthen Google Places name validation to prevent cross-company data contamination
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Replace substring matching with word-boundary tokenized matching
- Short names (1-2 words): require ALL significant words to match
- Longer names (3+ significant words): require at least 50% word overlap
- Pick best-scoring result instead of first match
- Add company_name validation to competitor_monitoring_service
- Show Google profile name in dashboard hints for admin verification
- Display mismatch warning when Google name differs from company name

Prevents cases like "IT Space" matching "Body Space" (score 0.50 < 1.00 threshold).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 07:37:05 +01:00
parent e0bb6b718a
commit ae9a658b0c
6 changed files with 81 additions and 15 deletions

View File

@ -741,10 +741,13 @@ def admin_company_detail(company_id):
# --- Hints: where to find missing data --- # --- Hints: where to find missing data ---
hints = {} hints = {}
analysis = seo_analysis # CompanyWebsiteAnalysis object or None analysis = seo_analysis # CompanyWebsiteAnalysis object or None
# Google name for mismatch warning (e.g. "IT Space" vs "Body Space")
google_name = (analysis.google_name or '') if analysis else ''
google_name_mismatch = bool(google_name and google_name.lower() != company.name.lower())
if not company.phone: if not company.phone:
if analysis and analysis.google_phone: if analysis and analysis.google_phone:
hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply'} hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
elif analysis and analysis.nap_on_website: elif analysis and analysis.nap_on_website:
nap = analysis.nap_on_website if isinstance(analysis.nap_on_website, dict) else {} nap = analysis.nap_on_website if isinstance(analysis.nap_on_website, dict) else {}
if nap.get('phone'): if nap.get('phone'):
@ -760,11 +763,11 @@ def admin_company_detail(company_id):
if not company.website: if not company.website:
if analysis and analysis.google_website: if analysis and analysis.google_website:
hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply'} hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
if not company.address_city: if not company.address_city:
if analysis and analysis.google_address: if analysis and analysis.google_address:
hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply'} hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
if not company.description_short: if not company.description_short:
if analysis and analysis.content_summary: if analysis and analysis.content_summary:

View File

@ -183,6 +183,7 @@ def admin_data_quality():
avg_score = round(score_sum / total) if total > 0 else 0 avg_score = round(score_sum / total) if total > 0 else 0
# Available data: companies where Google has data but company profile is empty # Available data: companies where Google has data but company profile is empty
# Include google_name so admin can verify the match is correct
available_data = [] available_data = []
analyses = db.query(CompanyWebsiteAnalysis).all() analyses = db.query(CompanyWebsiteAnalysis).all()
company_map = {c.id: c for c in companies} company_map = {c.id: c for c in companies}
@ -191,20 +192,24 @@ def admin_data_quality():
comp = company_map.get(a.company_id) comp = company_map.get(a.company_id)
if not comp: if not comp:
continue continue
g_name = a.google_name or ''
if a.google_phone and not comp.phone: if a.google_phone and not comp.phone:
available_data.append({ available_data.append({
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone 'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone,
'google_name': g_name,
}) })
if a.google_website and not comp.website: if a.google_website and not comp.website:
available_data.append({ available_data.append({
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website 'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website,
'google_name': g_name,
}) })
if a.google_address and not comp.address_city: if a.google_address and not comp.address_city:
available_data.append({ available_data.append({
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug, 'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
'field': 'Adres', 'source': 'Google Business', 'value': a.google_address 'field': 'Adres', 'source': 'Google Business', 'value': a.google_address,
'google_name': g_name,
}) })
return render_template( return render_template(

View File

@ -72,7 +72,8 @@ class CompetitorMonitoringService:
logger.warning(f"Company {company_id} has no Google Place ID, searching by name") logger.warning(f"Company {company_id} has no Google Place ID, searching by name")
# Search for the company first # Search for the company first
place = self.places_service.search_place( place = self.places_service.search_place(
f"{company.name} {company.address_city or 'Wejherowo'}" f"{company.name} {company.address_city or 'Wejherowo'}",
company_name=company.name
) )
if not place: if not place:
return [] return []

View File

@ -136,6 +136,38 @@ class GooglePlacesService:
logger.error(f"Places API request error for {place_id}: {e}") logger.error(f"Places API request error for {place_id}: {e}")
return None return None
@staticmethod
def _tokenize_name(name: str) -> set:
"""Tokenize a company name into significant lowercase words."""
import re as _re
skip_words = {
'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w',
'do', 'na', 'po', 'ze', 'the', 'and', 'of', 'for', 'group',
}
# Split on non-alphanumeric, keep words
words = _re.findall(r'[a-ząćęłńóśźż0-9]+', name.lower())
return {w for w in words if len(w) > 1 and w not in skip_words}
@staticmethod
def _name_match_score(company_name: str, google_name: str) -> float:
    """
    Score how well a Google result name covers a company name.

    Both names are reduced to sets of significant words via
    ``_tokenize_name``; the score is the fraction of the company's words
    that also occur, as whole words, in the Google name.

    Returns float 0.0-1.0:
    - 1.0 = every significant company word is present in the Google name
    - 0.0 = no word in common, or the company name has no significant words

    Because whole tokens are compared (not substrings), 'IT' does not
    match inside 'digital', and 'IT Space' vs 'Body Space' scores 0.5
    since only 'space' is shared.
    """
    tokenize = GooglePlacesService._tokenize_name
    wanted = tokenize(company_name)
    available = tokenize(google_name)
    total = len(wanted)
    if total == 0:
        # Nothing significant to compare against: treat as no match.
        return 0.0
    hits = len(wanted.intersection(available))
    return hits / total
def search_place(self, query: str, location_bias: Dict = None, def search_place(self, query: str, location_bias: Dict = None,
company_name: str = None) -> Optional[Dict[str, Any]]: company_name: str = None) -> Optional[Dict[str, Any]]:
""" """
@ -185,17 +217,32 @@ class GooglePlacesService:
if not company_name: if not company_name:
return places[0] return places[0]
# Validate: at least one significant word from company name must appear in result name # Validate: company name must significantly match Google result name.
skip_words = {'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w', 'do', 'na', 'po', 'ze'} # Uses word-boundary matching with minimum threshold:
name_words = {w.lower() for w in company_name.split() if len(w) > 1 and w.lower() not in skip_words} # - Short names (1-2 significant words): ALL words must match
# - Longer names (3+ words): at least 50% of words must match
company_words = self._tokenize_name(company_name)
min_ratio = 1.0 if len(company_words) <= 2 else 0.5
best_place = None
best_score = 0.0
for place in places: for place in places:
google_name = place.get('displayName', {}).get('text', '').lower() google_name = place.get('displayName', {}).get('text', '')
if any(word in google_name for word in name_words): score = self._name_match_score(company_name, google_name)
return place if score >= min_ratio and score > best_score:
best_score = score
best_place = place
if best_place:
matched_name = best_place.get('displayName', {}).get('text', '')
logger.info(
f"Name match for '{company_name}': '{matched_name}' (score={best_score:.2f})"
)
return best_place
logger.warning( logger.warning(
f"No name match for '{company_name}' in Google results: " f"No name match for '{company_name}' (min_ratio={min_ratio:.0%}) in Google results: "
f"{[p.get('displayName', {}).get('text', '') for p in places]}" f"{[p.get('displayName', {}).get('text', '') for p in places]}"
) )
return None return None

View File

@ -841,6 +841,11 @@
<span style="font-size: var(--font-size-xs); color: var(--primary);"> <span style="font-size: var(--font-size-xs); color: var(--primary);">
{{ hints[field_name].source }}{% if hints[field_name].value %}: {{ hints[field_name].value[:40] }}{% endif %} {{ hints[field_name].source }}{% if hints[field_name].value %}: {{ hints[field_name].value[:40] }}{% endif %}
</span> </span>
{% if hints[field_name].get('google_name') %}
<span style="font-size: var(--font-size-xs); color: #d97706;" title="Nazwa profilu Google nie zgadza się z nazwą firmy — zweryfikuj!">
(profil: {{ hints[field_name].google_name[:30] }})
</span>
{% endif %}
{% if hints[field_name].action == 'apply' and hints[field_name].value %} {% if hints[field_name].action == 'apply' and hints[field_name].value %}
<button onclick="applyHint({{ company.id }}, '{{ field_name }}', '{{ hints[field_name].value|e }}')" <button onclick="applyHint({{ company.id }}, '{{ field_name }}', '{{ hints[field_name].value|e }}')"
class="hint-apply-btn" title="Uzupełnij to pole">Uzupełnij</button> class="hint-apply-btn" title="Uzupełnij to pole">Uzupełnij</button>

View File

@ -582,7 +582,12 @@
<tr id="avail-row-{{ loop.index }}"> <tr id="avail-row-{{ loop.index }}">
<td><a href="{{ url_for('admin.admin_company_detail', company_id=item.company_id) }}" class="dq-company-link">{{ item.company_name }}</a></td> <td><a href="{{ url_for('admin.admin_company_detail', company_id=item.company_id) }}" class="dq-company-link">{{ item.company_name }}</a></td>
<td>{{ item.field }}</td> <td>{{ item.field }}</td>
<td><span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ item.source }}</span></td> <td>
<span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ item.source }}</span>
{% if item.google_name and item.google_name.lower() != item.company_name.lower() %}
<br><span style="font-size: var(--font-size-xs); color: #d97706;" title="Nazwa profilu Google — zweryfikuj dopasowanie">Profil: {{ item.google_name[:40] }}</span>
{% endif %}
</td>
<td style="font-size: var(--font-size-sm);">{{ item.value[:50] }}</td> <td style="font-size: var(--font-size-sm);">{{ item.value[:50] }}</td>
<td> <td>
<button class="hint-apply-btn" onclick="applyAvailableHint({{ item.company_id }}, '{{ item.field }}', '{{ item.value|e }}', 'avail-row-{{ loop.index }}')" <button class="hint-apply-btn" onclick="applyAvailableHint({{ item.company_id }}, '{{ item.field }}', '{{ item.value|e }}', 'avail-row-{{ loop.index }}')"