fix: improve Google Places name tokenizer and matching

- Treat & as a word connector (P&P, S&K stay as single tokens)
- Add prefix matching with legal suffix stripping (Sp. z o.o., S.A.)
- Add reverse prefix check for brand vs. legal name (Pixlab Softwarehouse ↔ Pixlab Sp. z o.o.)
- Compound names like TERMO-BUD are still correctly rejected (no space separator)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 08:00:08 +01:00 · 2026-02-21 08:00:08 +01:00 · 01bc40132e
commit 01bc40132e
parent e91c9d38f1
1 changed file with 30 additions and 8 deletions
--- a/google_places_service.py
+++ b/google_places_service.py
@@ -144,21 +144,43 @@ class GooglePlacesService:
            'sp', 'z', 'o', 'oo', 'sa', 'sc', 'j', 'k', 'ul', 'i', 'w',
            'do', 'na', 'po', 'ze', 'the', 'and', 'of', 'for', 'group',
        }
-        # Split on non-alphanumeric, keep words
-        words = _re.findall(r'[a-ząćęłńóśźż0-9]+', name.lower())
+        # Split on non-alphanumeric, keep words; treat & as connector (P&P, S&K)
+        words = _re.findall(r'[a-ząćęłńóśźż0-9]+(?:&[a-ząćęłńóśźż0-9]+)*', name.lower())
        return {w for w in words if len(w) > 1 and w not in skip_words}

    @staticmethod
    def _name_match_score(company_name: str, google_name: str) -> float:
        """
-        Compute bidirectional name match score between company name and Google result.
+        Compute name match score with prefix detection and bidirectional fallback.

-        Uses max(company_words, google_words) as denominator so that
-        extra words in either name lower the score:
-          - "TERMO" vs "TERMO-BUD" → 1/max(1,2) = 0.50
-          - "TERMO" vs "TERMO"     → 1/max(1,1) = 1.00
-          - "IT Space" vs "IT Space" → 2/max(2,2) = 1.00
+        1. If Google name starts with company name (word boundary) → 1.0
+           "INPI" matches "INPI - Infrastruktura IT" (same company, extra description)
+           "TERMO" does NOT match "TERMO-BUD" (compound name, no space separator)
+        2. Otherwise, bidirectional word matching with max() denominator.
        """
+        import re as _re
+        cn = company_name.strip().lower()
+        gn = google_name.strip().lower()
+
+        # Strip legal forms for prefix comparison (Sp. z o.o., S.A., Sp.j., etc.)
+        clean_cn = _re.sub(
+            r'\s*(sp\.?\s*z\.?\s*o\.?\s*o\.?|sp\.?\s*[jkp]\.?|s\.?\s*[ac]\.?)\s*\.?\s*$',
+            '', cn, flags=_re.IGNORECASE
+        ).strip() or cn
+
+        # Also strip legal forms from Google name for reverse prefix check
+        clean_gn = _re.sub(
+            r'\s*(sp\.?\s*z\.?\s*o\.?\s*o\.?|sp\.?\s*[jkp]\.?|s\.?\s*[ac]\.?)\s*\.?\s*$',
+            '', gn, flags=_re.IGNORECASE
+        ).strip() or gn
+
+        # Prefix check: company name at start of Google name, or vice versa,
+        # followed by space, period, comma, or end — NOT dash (compound names)
+        wb = r'(?:[\s.,;:]|$)'
+        if clean_cn and (_re.match(_re.escape(clean_cn) + wb, gn) or
+                         _re.match(_re.escape(clean_gn) + wb, clean_cn)):
+            return 1.0
+
        company_words = GooglePlacesService._tokenize_name(company_name)
        google_words = GooglePlacesService._tokenize_name(google_name)