fix: Prioritize LinkedIn company pages over personal profiles
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Add direct URL check for linkedin.com/company/{slug} before Brave Search
- Prioritize /company/ over /in/ in search result ranking
- Use targeted query "company_name linkedin.com/company" first
- Fall back to personal profile search only if company page not found
- Verify page title matches company name to avoid false positives

Fixes: WATERM showed employee's personal profile instead of existing
company page at linkedin.com/company/waterm

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-17 11:08:13 +01:00
parent 0f353b1c54
commit 990c6537cb

View File

@ -724,7 +724,7 @@ class BraveSearcher:
('instagram', f'{company_name} instagram'),
('tiktok', f'{company_name} tiktok'),
('youtube', f'{company_name} youtube kanał'),
('linkedin', f'{company_name} linkedin'),
('linkedin', f'{company_name} linkedin.com/company'),
]
for platform, query in platforms:
@ -736,6 +736,22 @@ class BraveSearcher:
except Exception as e:
logger.warning(f'Brave search failed for {platform}: {e}')
# LinkedIn: try direct URL check first, then Brave Search fallback
if 'linkedin' not in results or '/in/' in results.get('linkedin', ''):
direct_url = self._check_linkedin_company_page(company_name)
if direct_url:
results['linkedin'] = direct_url
logger.info(f"LinkedIn direct URL check found company page: {direct_url}")
elif 'linkedin' not in results:
# Last resort: search for any LinkedIn profile (personal included)
try:
url = self._search_brave(f'{company_name} linkedin', 'linkedin', company_name)
if url:
results['linkedin'] = url
logger.info(f"LinkedIn fallback found profile: {url}")
except Exception as e:
logger.warning(f'Brave search LinkedIn fallback failed: {e}')
return results
def search_google_reviews(self, company_name: str, city: str = 'Wejherowo') -> Dict[str, Any]:
@ -972,13 +988,16 @@ class BraveSearcher:
if not extracted_url:
extracted_url = result_url
candidates.append((token_matches, extracted_url))
# For LinkedIn: prioritize /company/ over /in/ (company pages > personal)
is_company_page = 1 if (platform == 'linkedin' and '/company/' in (extracted_url or '')) else 0
candidates.append((is_company_page, token_matches, extracted_url))
if candidates:
# Sort by number of token matches (best match first)
candidates.sort(key=lambda x: x[0], reverse=True)
best_url = candidates[0][1]
logger.info(f"Brave search matched {platform}: {best_url} (score: {candidates[0][0]}/{len(name_tokens)})")
# Sort by: 1) company page priority, 2) token matches (best match first)
candidates.sort(key=lambda x: (x[0], x[1]), reverse=True)
best_url = candidates[0][2]
logger.info(f"Brave search matched {platform}: {best_url} (company={candidates[0][0]}, score={candidates[0][1]}/{len(name_tokens)})")
return best_url
logger.debug(f"No {platform} profile found in Brave results for: {query}")
@ -994,6 +1013,49 @@ class BraveSearcher:
logger.warning(f"Error parsing Brave results for '{query}': {e}")
return None
def _check_linkedin_company_page(self, company_name: str) -> Optional[str]:
"""
Try direct LinkedIn company page URL based on company name slugs.
Returns URL if page exists and title matches, None otherwise.
"""
# Generate slug candidates from company name
name_clean = company_name.strip()
slugs = set()
# Basic slug: lowercase, spaces to hyphens
slug = re.sub(r'[^a-z0-9\s-]', '', name_clean.lower())
slug = re.sub(r'\s+', '-', slug).strip('-')
if slug:
slugs.add(slug)
# First word only (common for short brand names like "Waterm")
first_word = name_clean.split()[0].lower() if name_clean.split() else ''
first_word = re.sub(r'[^a-z0-9]', '', first_word)
if first_word and len(first_word) >= 3:
slugs.add(first_word)
name_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
for t in name_clean.lower().split() if len(t) >= 3]
for slug in slugs:
try:
check_url = f'https://www.linkedin.com/company/{slug}'
resp = self.session.get(check_url, timeout=8, allow_redirects=True)
if resp.status_code == 200:
# Verify title contains company name
title_match = re.search(r'<title>([^<]+)</title>', resp.text)
if title_match:
title = title_match.group(1).lower()
if any(t.search(title) for t in name_tokens):
logger.info(f"LinkedIn company page verified: {check_url} (title: {title_match.group(1)})")
return f'https://linkedin.com/company/{slug}'
else:
logger.debug(f"LinkedIn /company/{slug} exists but title '{title_match.group(1)}' doesn't match '{company_name}'")
except Exception as e:
logger.debug(f"LinkedIn company page check failed for {slug}: {e}")
return None
@staticmethod
def _build_social_url(platform: str, handle: str) -> str:
"""Build normalized social media URL from platform and handle."""