fix: improve Google Places name tokenizer and matching
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Treat & as word connector (P&P, S&K stay as single tokens)
- Add prefix matching with legal suffix stripping (Sp. z o.o., S.A.)
- Add reverse prefix for brand vs legal name (Pixlab Softwarehouse ↔ Pixlab Sp. z o.o.)
- Compound names like TERMO-BUD still correctly rejected (no space separator)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 08:00:08 +01:00
parent e91c9d38f1
commit 01bc40132e

View File

@@ -144,21 +144,43 @@ class GooglePlacesService:
'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w',
'do', 'na', 'po', 'ze', 'the', 'and', 'of', 'for', 'group',
}
# Split on non-alphanumeric, keep words
words = _re.findall(r'[a-ząćęłńóśźż0-9]+', name.lower())
# Split on non-alphanumeric, keep words; treat & as connector (P&P, S&K)
words = _re.findall(r'[a-ząćęłńóśźż0-9]+(?:&[a-ząćęłńóśźż0-9]+)*', name.lower())
return {w for w in words if len(w) > 1 and w not in skip_words}
@staticmethod
def _name_match_score(company_name: str, google_name: str) -> float:
"""
Compute bidirectional name match score between company name and Google result.
Compute name match score with prefix detection and bidirectional fallback.
Uses max(company_words, google_words) as denominator so that
extra words in either name lower the score:
- "TERMO" vs "TERMO-BUD" → 1/max(1,2) = 0.50
- "TERMO" vs "TERMO" → 1/max(1,1) = 1.00
- "IT Space" vs "IT Space" → 2/max(2,2) = 1.00
1. If Google name starts with company name (word boundary) → 1.0
"INPI" matches "INPI - Infrastruktura IT" (same company, extra description)
"TERMO" does NOT match "TERMO-BUD" (compound name, no space separator)
2. Otherwise, bidirectional word matching with max() denominator.
"""
import re as _re
cn = company_name.strip().lower()
gn = google_name.strip().lower()
# Strip legal forms for prefix comparison (Sp. z o.o., S.A., Sp.j., etc.)
clean_cn = _re.sub(
r'\s*(sp\.?\s*z\.?\s*o\.?\s*o\.?|sp\.?\s*[jkp]\.?|s\.?\s*[ac]\.?)\s*\.?\s*$',
'', cn, flags=_re.IGNORECASE
).strip() or cn
# Also strip legal forms from Google name for reverse prefix check
clean_gn = _re.sub(
r'\s*(sp\.?\s*z\.?\s*o\.?\s*o\.?|sp\.?\s*[jkp]\.?|s\.?\s*[ac]\.?)\s*\.?\s*$',
'', gn, flags=_re.IGNORECASE
).strip() or gn
# Prefix check: company name at start of Google name, or vice versa,
# followed by space, period, comma, or end — NOT dash (compound names)
wb = r'(?:[\s.,;:]|$)'
if clean_cn and (_re.match(_re.escape(clean_cn) + wb, gn) or
_re.match(_re.escape(clean_gn) + wb, clean_cn)):
return 1.0
company_words = GooglePlacesService._tokenize_name(company_name)
google_words = GooglePlacesService._tokenize_name(google_name)