fix: Prioritize LinkedIn company pages over personal profiles
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add direct URL check for linkedin.com/company/{slug} before Brave Search
- Prioritize /company/ over /in/ in search result ranking
- Use targeted query "company_name linkedin.com/company" first
- Fall back to personal profile search only if company page not found
- Verify page title matches company name to avoid false positives
Fixes: WATERM showed an employee's personal profile instead of the existing
company page at linkedin.com/company/waterm
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0f353b1c54
commit
990c6537cb
@ -724,7 +724,7 @@ class BraveSearcher:
|
||||
('instagram', f'{company_name} instagram'),
|
||||
('tiktok', f'{company_name} tiktok'),
|
||||
('youtube', f'{company_name} youtube kanał'),
|
||||
('linkedin', f'{company_name} linkedin'),
|
||||
('linkedin', f'{company_name} linkedin.com/company'),
|
||||
]
|
||||
|
||||
for platform, query in platforms:
|
||||
@ -736,6 +736,22 @@ class BraveSearcher:
|
||||
except Exception as e:
|
||||
logger.warning(f'Brave search failed for {platform}: {e}')
|
||||
|
||||
# LinkedIn: try direct URL check first, then Brave Search fallback
|
||||
if 'linkedin' not in results or '/in/' in results.get('linkedin', ''):
|
||||
direct_url = self._check_linkedin_company_page(company_name)
|
||||
if direct_url:
|
||||
results['linkedin'] = direct_url
|
||||
logger.info(f"LinkedIn direct URL check found company page: {direct_url}")
|
||||
elif 'linkedin' not in results:
|
||||
# Last resort: search for any LinkedIn profile (personal included)
|
||||
try:
|
||||
url = self._search_brave(f'{company_name} linkedin', 'linkedin', company_name)
|
||||
if url:
|
||||
results['linkedin'] = url
|
||||
logger.info(f"LinkedIn fallback found profile: {url}")
|
||||
except Exception as e:
|
||||
logger.warning(f'Brave search LinkedIn fallback failed: {e}')
|
||||
|
||||
return results
|
||||
|
||||
def search_google_reviews(self, company_name: str, city: str = 'Wejherowo') -> Dict[str, Any]:
|
||||
@ -972,13 +988,16 @@ class BraveSearcher:
|
||||
if not extracted_url:
|
||||
extracted_url = result_url
|
||||
|
||||
candidates.append((token_matches, extracted_url))
|
||||
# For LinkedIn: prioritize /company/ over /in/ (company pages > personal)
|
||||
is_company_page = 1 if (platform == 'linkedin' and '/company/' in (extracted_url or '')) else 0
|
||||
|
||||
candidates.append((is_company_page, token_matches, extracted_url))
|
||||
|
||||
if candidates:
|
||||
# Sort by number of token matches (best match first)
|
||||
candidates.sort(key=lambda x: x[0], reverse=True)
|
||||
best_url = candidates[0][1]
|
||||
logger.info(f"Brave search matched {platform}: {best_url} (score: {candidates[0][0]}/{len(name_tokens)})")
|
||||
# Sort by: 1) company page priority, 2) token matches (best match first)
|
||||
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
|
||||
best_url = candidates[0][2]
|
||||
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)})")
|
||||
return best_url
|
||||
|
||||
logger.debug(f"No {platform} profile found in Brave results for: {query}")
|
||||
@ -994,6 +1013,49 @@ class BraveSearcher:
|
||||
logger.warning(f"Error parsing Brave results for '{query}': {e}")
|
||||
return None
|
||||
|
||||
def _check_linkedin_company_page(self, company_name: str) -> Optional[str]:
    """
    Try direct LinkedIn company page URLs derived from the company name.

    Up to two slug candidates are probed, most specific first:

    1. the full name, lowercased, with characters outside ``[a-z0-9]``,
       whitespace and hyphens dropped, and every run of whitespace/hyphens
       collapsed to a single hyphen;
    2. the first word alone (common for short brand names like "Waterm"),
       when it still has at least 3 characters after cleaning.

    Args:
        company_name: Company name used to build the slug candidates and the
            title-verification tokens.

    Returns:
        ``https://linkedin.com/company/<slug>`` for the first candidate whose
        page answers HTTP 200 and whose ``<title>`` contains, as a whole
        word, at least one whitespace-separated token (3+ characters) of the
        company name. None if no candidate is verified.
    """
    name_clean = company_name.strip()
    words = name_clean.split()

    # Ordered list instead of a set: iteration order of a set of strings
    # depends on hash randomization, which made the probe order (and hence
    # the returned URL when both candidates exist) vary between runs.
    slugs = []

    # Full-name slug. Collapsing [\s-]+ avoids "foo---bar" for "Foo - Bar".
    full_slug = re.sub(r'[^a-z0-9\s-]', '', name_clean.lower())
    full_slug = re.sub(r'[\s-]+', '-', full_slug).strip('-')
    if full_slug:
        slugs.append(full_slug)

    # First word only (common for short brand names like "Waterm")
    first_word = re.sub(r'[^a-z0-9]', '', words[0].lower()) if words else ''
    if len(first_word) >= 3 and first_word not in slugs:
        slugs.append(first_word)

    # Whole-word, case-insensitive patterns for every name token of 3+ chars;
    # a single hit in the page title counts as verification.
    name_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
                   for t in name_clean.lower().split() if len(t) >= 3]

    for slug in slugs:
        try:
            check_url = f'https://www.linkedin.com/company/{slug}'
            # NOTE(review): self.session is presumably a requests.Session
            # (get/status_code/text are used) - confirm against __init__.
            resp = self.session.get(check_url, timeout=8, allow_redirects=True)
            if resp.status_code == 200:
                # Verify title contains company name. The tag is matched
                # case-insensitively and may carry attributes.
                title_match = re.search(r'<title[^>]*>([^<]+)</title>', resp.text, re.IGNORECASE)
                if title_match:
                    title = title_match.group(1).lower()
                    if any(t.search(title) for t in name_tokens):
                        logger.info(f"LinkedIn company page verified: {check_url} (title: {title_match.group(1)})")
                        return f'https://linkedin.com/company/{slug}'
                    else:
                        logger.debug(f"LinkedIn /company/{slug} exists but title '{title_match.group(1)}' doesn't match '{company_name}'")
        except Exception as e:
            # Best-effort probe: a failure on one candidate must not abort
            # the remaining candidates or the caller's search flow.
            logger.debug(f"LinkedIn company page check failed for {slug}: {e}")

    return None
|
||||
|
||||
@staticmethod
|
||||
def _build_social_url(platform: str, handle: str) -> str:
|
||||
"""Build normalized social media URL from platform and handle."""
|
||||
|
||||
Loading…
Reference in New Issue
Block a user