fix: strengthen Google Places name validation to prevent cross-company data contamination
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Replace substring matching with word-boundary tokenized matching
- Short names (1-2 words): require ALL significant words to match
- Longer names (3+): require at least 50% word overlap
- Pick best-scoring result instead of first match
- Add company_name validation to competitor_monitoring_service
- Show Google profile name in dashboard hints for admin verification
- Display mismatch warning when Google name differs from company name

Prevents cases like "IT Space" matching "Body Space" (score 0.50 < 1.00 threshold).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e0bb6b718a
commit
ae9a658b0c
@ -741,10 +741,13 @@ def admin_company_detail(company_id):
|
|||||||
# --- Hints: where to find missing data ---
|
# --- Hints: where to find missing data ---
|
||||||
hints = {}
|
hints = {}
|
||||||
analysis = seo_analysis # CompanyWebsiteAnalysis object or None
|
analysis = seo_analysis # CompanyWebsiteAnalysis object or None
|
||||||
|
# Google name for mismatch warning (e.g. "IT Space" vs "Body Space")
|
||||||
|
google_name = (analysis.google_name or '') if analysis else ''
|
||||||
|
google_name_mismatch = bool(google_name and google_name.lower() != company.name.lower())
|
||||||
|
|
||||||
if not company.phone:
|
if not company.phone:
|
||||||
if analysis and analysis.google_phone:
|
if analysis and analysis.google_phone:
|
||||||
hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply'}
|
hints['Telefon'] = {'source': 'Google Business', 'value': analysis.google_phone, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
|
||||||
elif analysis and analysis.nap_on_website:
|
elif analysis and analysis.nap_on_website:
|
||||||
nap = analysis.nap_on_website if isinstance(analysis.nap_on_website, dict) else {}
|
nap = analysis.nap_on_website if isinstance(analysis.nap_on_website, dict) else {}
|
||||||
if nap.get('phone'):
|
if nap.get('phone'):
|
||||||
@ -760,11 +763,11 @@ def admin_company_detail(company_id):
|
|||||||
|
|
||||||
if not company.website:
|
if not company.website:
|
||||||
if analysis and analysis.google_website:
|
if analysis and analysis.google_website:
|
||||||
hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply'}
|
hints['Strona WWW'] = {'source': 'Google Business', 'value': analysis.google_website, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
|
||||||
|
|
||||||
if not company.address_city:
|
if not company.address_city:
|
||||||
if analysis and analysis.google_address:
|
if analysis and analysis.google_address:
|
||||||
hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply'}
|
hints['Adres'] = {'source': 'Google Business', 'value': analysis.google_address, 'action': 'apply', 'google_name': google_name if google_name_mismatch else ''}
|
||||||
|
|
||||||
if not company.description_short:
|
if not company.description_short:
|
||||||
if analysis and analysis.content_summary:
|
if analysis and analysis.content_summary:
|
||||||
|
|||||||
@ -183,6 +183,7 @@ def admin_data_quality():
|
|||||||
avg_score = round(score_sum / total) if total > 0 else 0
|
avg_score = round(score_sum / total) if total > 0 else 0
|
||||||
|
|
||||||
# Available data: companies where Google has data but company profile is empty
|
# Available data: companies where Google has data but company profile is empty
|
||||||
|
# Include google_name so admin can verify the match is correct
|
||||||
available_data = []
|
available_data = []
|
||||||
analyses = db.query(CompanyWebsiteAnalysis).all()
|
analyses = db.query(CompanyWebsiteAnalysis).all()
|
||||||
company_map = {c.id: c for c in companies}
|
company_map = {c.id: c for c in companies}
|
||||||
@ -191,20 +192,24 @@ def admin_data_quality():
|
|||||||
comp = company_map.get(a.company_id)
|
comp = company_map.get(a.company_id)
|
||||||
if not comp:
|
if not comp:
|
||||||
continue
|
continue
|
||||||
|
g_name = a.google_name or ''
|
||||||
if a.google_phone and not comp.phone:
|
if a.google_phone and not comp.phone:
|
||||||
available_data.append({
|
available_data.append({
|
||||||
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
||||||
'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone
|
'field': 'Telefon', 'source': 'Google Business', 'value': a.google_phone,
|
||||||
|
'google_name': g_name,
|
||||||
})
|
})
|
||||||
if a.google_website and not comp.website:
|
if a.google_website and not comp.website:
|
||||||
available_data.append({
|
available_data.append({
|
||||||
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
||||||
'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website
|
'field': 'Strona WWW', 'source': 'Google Business', 'value': a.google_website,
|
||||||
|
'google_name': g_name,
|
||||||
})
|
})
|
||||||
if a.google_address and not comp.address_city:
|
if a.google_address and not comp.address_city:
|
||||||
available_data.append({
|
available_data.append({
|
||||||
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
'company_id': comp.id, 'company_name': comp.name, 'company_slug': comp.slug,
|
||||||
'field': 'Adres', 'source': 'Google Business', 'value': a.google_address
|
'field': 'Adres', 'source': 'Google Business', 'value': a.google_address,
|
||||||
|
'google_name': g_name,
|
||||||
})
|
})
|
||||||
|
|
||||||
return render_template(
|
return render_template(
|
||||||
|
|||||||
@ -72,7 +72,8 @@ class CompetitorMonitoringService:
|
|||||||
logger.warning(f"Company {company_id} has no Google Place ID, searching by name")
|
logger.warning(f"Company {company_id} has no Google Place ID, searching by name")
|
||||||
# Search for the company first
|
# Search for the company first
|
||||||
place = self.places_service.search_place(
|
place = self.places_service.search_place(
|
||||||
f"{company.name} {company.address_city or 'Wejherowo'}"
|
f"{company.name} {company.address_city or 'Wejherowo'}",
|
||||||
|
company_name=company.name
|
||||||
)
|
)
|
||||||
if not place:
|
if not place:
|
||||||
return []
|
return []
|
||||||
|
|||||||
@ -136,6 +136,38 @@ class GooglePlacesService:
|
|||||||
logger.error(f"Places API request error for {place_id}: {e}")
|
logger.error(f"Places API request error for {place_id}: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@staticmethod
def _tokenize_name(name: str) -> set:
    """
    Tokenize a company name into significant lowercase words.

    The name is lowercased and NFC-normalized, then split into runs of
    letters and digits; every other character (including '_') acts as a
    separator. One-character tokens and common legal-form / filler words
    (e.g. 'sp', 'oo', 'sa', 'the', 'group') are dropped.

    Args:
        name: Raw company or Google profile name. ``None`` or an empty
            string yields an empty set instead of raising.

    Returns:
        Set of significant lowercase tokens (possibly empty).
    """
    import re as _re
    import unicodedata as _unicodedata

    if not name:
        return set()

    skip_words = {
        'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w',
        'do', 'na', 'po', 'ze', 'the', 'and', 'of', 'for', 'group',
    }
    # Compose decomposed diacritics (e.g. 'z' + U+0307 -> 'ż') so that a
    # combining mark does not cut a word in two.
    normalized = _unicodedata.normalize('NFC', name.lower())
    # [^\W_]+ matches runs of Unicode letters/digits. Unlike a hard-coded
    # ASCII + Polish character class, it keeps words such as 'müller' or
    # 'café' intact rather than producing fragments like 'ller' / 'caf'.
    words = _re.findall(r'[^\W_]+', normalized)
    return {w for w in words if len(w) > 1 and w not in skip_words}
|
||||||
|
|
||||||
|
@staticmethod
def _name_match_score(company_name: str, google_name: str) -> float:
    """
    Score how well a Google result name covers a company name.

    Both names are tokenized with ``_tokenize_name`` and compared as whole
    tokens, so a short word such as 'it' never matches inside 'digital'.

    The score is the fraction of the company's significant tokens that
    also occur in the Google name:
        - 1.0 = every significant company token is present
        - 0.0 = no token is shared, or the company name has no
          significant tokens at all

    The ratio is relative to the company name only; extra words in the
    Google name do not lower it. Example: 'IT Space' vs 'Body Space'
    shares one of two company tokens and scores 0.5.
    """
    wanted = GooglePlacesService._tokenize_name(company_name)
    found = GooglePlacesService._tokenize_name(google_name)

    total = len(wanted)
    if total == 0:
        # Nothing significant to look for; treat as "no match".
        return 0.0

    return len(wanted.intersection(found)) / total
|
||||||
|
|
||||||
def search_place(self, query: str, location_bias: Dict = None,
|
def search_place(self, query: str, location_bias: Dict = None,
|
||||||
company_name: str = None) -> Optional[Dict[str, Any]]:
|
company_name: str = None) -> Optional[Dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
@ -185,17 +217,32 @@ class GooglePlacesService:
|
|||||||
if not company_name:
|
if not company_name:
|
||||||
return places[0]
|
return places[0]
|
||||||
|
|
||||||
# Validate: at least one significant word from company name must appear in result name
|
# Validate: company name must significantly match Google result name.
|
||||||
skip_words = {'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w', 'do', 'na', 'po', 'ze'}
|
# Uses word-boundary matching with minimum threshold:
|
||||||
name_words = {w.lower() for w in company_name.split() if len(w) > 1 and w.lower() not in skip_words}
|
# - Short names (1-2 significant words): ALL words must match
|
||||||
|
# - Longer names (3+ words): at least 50% of words must match
|
||||||
|
company_words = self._tokenize_name(company_name)
|
||||||
|
min_ratio = 1.0 if len(company_words) <= 2 else 0.5
|
||||||
|
|
||||||
|
best_place = None
|
||||||
|
best_score = 0.0
|
||||||
|
|
||||||
for place in places:
|
for place in places:
|
||||||
google_name = place.get('displayName', {}).get('text', '').lower()
|
google_name = place.get('displayName', {}).get('text', '')
|
||||||
if any(word in google_name for word in name_words):
|
score = self._name_match_score(company_name, google_name)
|
||||||
return place
|
if score >= min_ratio and score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_place = place
|
||||||
|
|
||||||
|
if best_place:
|
||||||
|
matched_name = best_place.get('displayName', {}).get('text', '')
|
||||||
|
logger.info(
|
||||||
|
f"Name match for '{company_name}': '{matched_name}' (score={best_score:.2f})"
|
||||||
|
)
|
||||||
|
return best_place
|
||||||
|
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"No name match for '{company_name}' in Google results: "
|
f"No name match for '{company_name}' (min_ratio={min_ratio:.0%}) in Google results: "
|
||||||
f"{[p.get('displayName', {}).get('text', '') for p in places]}"
|
f"{[p.get('displayName', {}).get('text', '') for p in places]}"
|
||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|||||||
@ -841,6 +841,11 @@
|
|||||||
<span style="font-size: var(--font-size-xs); color: var(--primary);">
|
<span style="font-size: var(--font-size-xs); color: var(--primary);">
|
||||||
{{ hints[field_name].source }}{% if hints[field_name].value %}: {{ hints[field_name].value[:40] }}{% endif %}
|
{{ hints[field_name].source }}{% if hints[field_name].value %}: {{ hints[field_name].value[:40] }}{% endif %}
|
||||||
</span>
|
</span>
|
||||||
|
{% if hints[field_name].get('google_name') %}
|
||||||
|
<span style="font-size: var(--font-size-xs); color: #d97706;" title="Nazwa profilu Google nie zgadza się z nazwą firmy — zweryfikuj!">
|
||||||
|
(profil: {{ hints[field_name].google_name[:30] }})
|
||||||
|
</span>
|
||||||
|
{% endif %}
|
||||||
{% if hints[field_name].action == 'apply' and hints[field_name].value %}
|
{% if hints[field_name].action == 'apply' and hints[field_name].value %}
|
||||||
<button onclick="applyHint({{ company.id }}, '{{ field_name }}', '{{ hints[field_name].value|e }}')"
|
<button onclick="applyHint({{ company.id }}, '{{ field_name }}', '{{ hints[field_name].value|e }}')"
|
||||||
class="hint-apply-btn" title="Uzupełnij to pole">Uzupełnij</button>
|
class="hint-apply-btn" title="Uzupełnij to pole">Uzupełnij</button>
|
||||||
|
|||||||
@ -582,7 +582,12 @@
|
|||||||
<tr id="avail-row-{{ loop.index }}">
|
<tr id="avail-row-{{ loop.index }}">
|
||||||
<td><a href="{{ url_for('admin.admin_company_detail', company_id=item.company_id) }}" class="dq-company-link">{{ item.company_name }}</a></td>
|
<td><a href="{{ url_for('admin.admin_company_detail', company_id=item.company_id) }}" class="dq-company-link">{{ item.company_name }}</a></td>
|
||||||
<td>{{ item.field }}</td>
|
<td>{{ item.field }}</td>
|
||||||
<td><span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ item.source }}</span></td>
|
<td>
|
||||||
|
<span style="font-size: var(--font-size-xs); color: var(--text-secondary);">{{ item.source }}</span>
|
||||||
|
{% if item.google_name and item.google_name.lower() != item.company_name.lower() %}
|
||||||
|
<br><span style="font-size: var(--font-size-xs); color: #d97706;" title="Nazwa profilu Google — zweryfikuj dopasowanie">Profil: {{ item.google_name[:40] }}</span>
|
||||||
|
{% endif %}
|
||||||
|
</td>
|
||||||
<td style="font-size: var(--font-size-sm);">{{ item.value[:50] }}</td>
|
<td style="font-size: var(--font-size-sm);">{{ item.value[:50] }}</td>
|
||||||
<td>
|
<td>
|
||||||
<button class="hint-apply-btn" onclick="applyAvailableHint({{ item.company_id }}, '{{ item.field }}', '{{ item.value|e }}', 'avail-row-{{ loop.index }}')"
|
<button class="hint-apply-btn" onclick="applyAvailableHint({{ item.company_id }}, '{{ item.field }}', '{{ item.value|e }}', 'avail-row-{{ loop.index }}')"
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user