revert: Remove city-aware token matching, keep handle-based exclude fix
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

City tokens caused too many false positives (matching any business from
the same city), so matching is reverted to name-only. The exclude fix
(checking the handle instead of a substring of the full URL) is preserved,
as it fixes a genuine bug where 'p' in the exclude list matched any URL.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-18 18:05:40 +01:00
parent 9892e07b26
commit f2f65abca2

View File

@ -729,7 +729,7 @@ class BraveSearcher:
for platform, query in platforms:
try:
url = self._search_brave(query, platform, company_name, city)
url = self._search_brave(query, platform, company_name)
if url:
results[platform] = url
time.sleep(0.5) # Rate limiting
@ -894,10 +894,10 @@ class BraveSearcher:
logger.warning(f"Error parsing Brave results for '{company_name}': {e}")
return None
def _search_brave(self, query: str, platform: str, company_name: str = '', city: str = '') -> Optional[str]:
def _search_brave(self, query: str, platform: str, company_name: str = '', **kwargs) -> Optional[str]:
"""
Perform Brave search and extract relevant social media URL.
Validates results against company_name (and optionally city) to avoid false matches.
Validates results against company_name to avoid false matches.
Returns normalized URL for the platform or None.
"""
if not self.api_key:
@ -939,15 +939,12 @@ class BraveSearcher:
if not pattern:
return None
# Prepare company name tokens (weight=2) and city tokens (weight=1)
# Prepare company name variations for matching
name_lower = company_name.lower().strip()
# Generate matching tokens with word boundary patterns
# (e.g. "Waterm Artur Wiertel" -> [r'\bwaterm\b', r'\bartur\b', r'\bwiertel\b'])
name_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
for t in name_lower.split() if len(t) >= 3]
city_tokens = []
if city:
city_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
for t in city.lower().strip().split() if len(t) >= 3]
candidates = []
for rank, result in enumerate(results):
@ -982,27 +979,22 @@ class BraveSearcher:
# Check if result relates to the company
searchable = f'{result_title} {result_desc} {result_url}'.lower()
# Name tokens: weight 2 each, city tokens: weight 1 each
name_score = sum(2 for t in name_tokens if t.search(searchable))
city_score = sum(1 for t in city_tokens if t.search(searchable))
total_score = name_score + city_score
# Count how many name tokens appear in the result (word boundary match)
token_matches = sum(1 for t in name_tokens if t.search(searchable))
# Accept if: name matches (score>=2), or city match on top-3 result (score>=1)
if total_score == 0:
continue
if name_score == 0 and rank >= 3:
continue # City-only match outside top 3 → skip (false positive risk)
if token_matches == 0:
continue # No connection to company at all
# For LinkedIn: prioritize /company/ over /in/ (company pages > personal)
is_company_page = 1 if (platform == 'linkedin' and '/company/' in (extracted_url or '')) else 0
candidates.append((is_company_page, total_score, extracted_url))
candidates.append((is_company_page, token_matches, extracted_url))
if candidates:
# Sort by: 1) company page priority, 2) total score (best match first)
# Sort by: 1) company page priority, 2) token matches (best match first)
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
best_url = candidates[0][2]
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)*2+len(city_tokens)})")
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)})")
return best_url
logger.debug(f"No {platform} profile found in Brave results for: {query}")