revert: Remove city-aware token matching, keep handle-based exclude fix
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
City tokens caused too many false positives (matching any business from the same city), so matching is reverted to name-only.

The exclude fix (checking the handle instead of a full-URL substring) is preserved because it fixes a genuine bug where 'p' in the exclude list matched any URL.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
9892e07b26
commit
f2f65abca2
@ -729,7 +729,7 @@ class BraveSearcher:
|
||||
|
||||
for platform, query in platforms:
|
||||
try:
|
||||
url = self._search_brave(query, platform, company_name, city)
|
||||
url = self._search_brave(query, platform, company_name)
|
||||
if url:
|
||||
results[platform] = url
|
||||
time.sleep(0.5) # Rate limiting
|
||||
@ -894,10 +894,10 @@ class BraveSearcher:
|
||||
logger.warning(f"Error parsing Brave results for '{company_name}': {e}")
|
||||
return None
|
||||
|
||||
def _search_brave(self, query: str, platform: str, company_name: str = '', city: str = '') -> Optional[str]:
|
||||
def _search_brave(self, query: str, platform: str, company_name: str = '', **kwargs) -> Optional[str]:
|
||||
"""
|
||||
Perform Brave search and extract relevant social media URL.
|
||||
Validates results against company_name (and optionally city) to avoid false matches.
|
||||
Validates results against company_name to avoid false matches.
|
||||
Returns normalized URL for the platform or None.
|
||||
"""
|
||||
if not self.api_key:
|
||||
@ -939,15 +939,12 @@ class BraveSearcher:
|
||||
if not pattern:
|
||||
return None
|
||||
|
||||
# Prepare company name tokens (weight=2) and city tokens (weight=1)
|
||||
# Prepare company name variations for matching
|
||||
name_lower = company_name.lower().strip()
|
||||
# Generate matching tokens with word boundary patterns
|
||||
# (e.g. "Waterm Artur Wiertel" -> [r'\bwaterm\b', r'\bartur\b', r'\bwiertel\b'])
|
||||
name_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
|
||||
for t in name_lower.split() if len(t) >= 3]
|
||||
city_tokens = []
|
||||
if city:
|
||||
city_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
|
||||
for t in city.lower().strip().split() if len(t) >= 3]
|
||||
|
||||
candidates = []
|
||||
for rank, result in enumerate(results):
|
||||
@ -982,27 +979,22 @@ class BraveSearcher:
|
||||
|
||||
# Check if result relates to the company
|
||||
searchable = f'{result_title} {result_desc} {result_url}'.lower()
|
||||
# Name tokens: weight 2 each, city tokens: weight 1 each
|
||||
name_score = sum(2 for t in name_tokens if t.search(searchable))
|
||||
city_score = sum(1 for t in city_tokens if t.search(searchable))
|
||||
total_score = name_score + city_score
|
||||
# Count how many name tokens appear in the result (word boundary match)
|
||||
token_matches = sum(1 for t in name_tokens if t.search(searchable))
|
||||
|
||||
# Accept if: name matches (score>=2), or city match on top-3 result (score>=1)
|
||||
if total_score == 0:
|
||||
continue
|
||||
if name_score == 0 and rank >= 3:
|
||||
continue # City-only match outside top 3 → skip (false positive risk)
|
||||
if token_matches == 0:
|
||||
continue # No connection to company at all
|
||||
|
||||
# For LinkedIn: prioritize /company/ over /in/ (company pages > personal)
|
||||
is_company_page = 1 if (platform == 'linkedin' and '/company/' in (extracted_url or '')) else 0
|
||||
|
||||
candidates.append((is_company_page, total_score, extracted_url))
|
||||
candidates.append((is_company_page, token_matches, extracted_url))
|
||||
|
||||
if candidates:
|
||||
# Sort by: 1) company page priority, 2) total score (best match first)
|
||||
# Sort by: 1) company page priority, 2) token matches (best match first)
|
||||
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
|
||||
best_url = candidates[0][2]
|
||||
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)*2+len(city_tokens)})")
|
||||
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)})")
|
||||
return best_url
|
||||
|
||||
logger.debug(f"No {platform} profile found in Brave results for: {query}")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user