fix: Implement Brave Search for LinkedIn detection and fix URL construction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Replace placeholder _search_brave() with real Brave API integration
- Fix LinkedIn URL construction: /in/ profiles were incorrectly built as /company/
- Add word-boundary matching to validate search results against company name
- Track source (website_scrape vs brave_search) per platform in audit results
- Increase search results from 5 to 10 for better coverage

Fixes: WATERM LinkedIn profile not detected (website has no LinkedIn link,
but Brave Search finds the personal /in/ profile)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-17 10:52:09 +01:00
parent d3b59b824e
commit 6633b94644

View File

@ -129,8 +129,8 @@ SOCIAL_MEDIA_PATTERNS = {
r'(?:https?://)?(?:www\.)?youtube\.com/([^/?\s"\'<>]+)',
],
'linkedin': [
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/company/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/in/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(company/[^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(in/[^/?\s"\'<>]+)',
],
'tiktok': [
r'(?:https?://)?(?:www\.)?tiktok\.com/@([^/?\s"\'<>]+)',
@ -145,7 +145,7 @@ SOCIAL_MEDIA_EXCLUDE = {
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories', 'p', 'people', 'hashtag', 'help', 'settings', 'notifications', 'tr', 'privacy', 'policies', 'ads', 'business', 'legal', 'flx'],
'instagram': ['explore', 'accounts', 'p', 'reel'],
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
'linkedin': ['shareArticle', 'share', 'login'],
'linkedin': ['company/shareArticle', 'company/share', 'company/login', 'in/shareArticle', 'in/share', 'in/login'],
'tiktok': ['embed', 'video'],
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com', 'bold_themes', 'boldthemes'],
}
@ -478,7 +478,7 @@ class WebsiteAuditor:
else:
url = f'https://youtube.com/channel/{match}'
elif platform == 'linkedin':
url = f'https://linkedin.com/company/{match}'
url = f'https://linkedin.com/{match}'
elif platform == 'tiktok':
url = f'https://tiktok.com/@{match}'
elif platform == 'twitter':
@ -729,7 +729,7 @@ class BraveSearcher:
for platform, query in platforms:
try:
url = self._search_brave(query, platform)
url = self._search_brave(query, platform, company_name)
if url:
results[platform] = url
time.sleep(0.5) # Rate limiting
@ -884,14 +884,137 @@ class BraveSearcher:
logger.warning(f"Error parsing Brave results for '{company_name}': {e}")
return None
def _search_brave(self, query: str, platform: str, company_name: str = '') -> Optional[str]:
    """
    Perform Brave search and extract relevant social media URL.

    Results are validated against company_name to avoid false matches:
    a result is only considered when at least one name token (3+ chars)
    appears as a whole word in its title, description or URL.

    Args:
        query: Search phrase sent to the Brave Web Search API.
        platform: One of 'facebook', 'instagram', 'youtube', 'linkedin',
            'tiktok', 'twitter'. Any other value yields None.
        company_name: Company name used to validate results. When it
            contains no token of 3+ characters nothing can be validated,
            so None is returned.

    Returns:
        Normalized URL for the platform (the candidate matching the most
        name tokens; the earliest result wins ties), or None when nothing
        matched or the request failed. Request and parsing errors are
        logged and never raised.
    """
    if not self.api_key:
        logger.debug(f"No Brave API key - skipping search for {platform}")
        return None

    # Platform domain patterns
    domain_patterns = {
        'facebook': r'facebook\.com/',
        'instagram': r'instagram\.com/',
        'youtube': r'youtube\.com/',
        'linkedin': r'linkedin\.com/(?:company|in)/',
        'tiktok': r'tiktok\.com/@',
        'twitter': r'(?:twitter|x)\.com/',
    }
    pattern = domain_patterns.get(platform)
    if not pattern:
        # Unsupported platform: bail out before spending an API call.
        return None

    # Generate matching tokens with word boundary patterns
    # (e.g. "Waterm Artur Wiertel" -> [r'\bwaterm\b', r'\bartur\b', r'\bwiertel\b'])
    name_tokens = [
        re.compile(r'\b' + re.escape(token) + r'\b', re.IGNORECASE)
        for token in company_name.lower().split()
        if len(token) >= 3
    ]
    if not name_tokens:
        # Every result would be rejected by the token check below,
        # so the request would be wasted.
        logger.debug(f"No usable company name tokens - skipping search for {platform}")
        return None

    # Loop invariants, hoisted.
    excludes = [ex.lower() for ex in SOCIAL_MEDIA_EXCLUDE.get(platform, [])]
    handle_patterns = SOCIAL_MEDIA_PATTERNS.get(platform, [])

    try:
        url = 'https://api.search.brave.com/res/v1/web/search'
        headers = {
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip',
            'X-Subscription-Token': self.api_key,
        }
        params = {
            'q': query,
            'count': 10,
            'country': 'pl',
            'search_lang': 'pl',
            'ui_lang': 'pl-PL',
        }

        response = self.session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        data = response.json()
        # Tolerate missing or null 'web' / 'results' entries.
        results = (data.get('web') or {}).get('results') or []

        candidates = []
        for result in results:
            result_url = result.get('url') or ''
            result_title = result.get('title') or ''
            result_desc = result.get('description') or ''

            if not re.search(pattern, result_url, re.IGNORECASE):
                continue

            # Validate it's a real profile, not a search/share page.
            # Excludes are compared against whole host/path segments. A raw
            # substring test would reject every URL for platforms whose
            # exclude list has one-letter entries ('p', 'c'), because those
            # letters occur in "https://" and ".com".
            locator = re.sub(r'^[a-z][a-z0-9+.\-]*://', '', result_url.lower())
            locator = re.split(r'[?#]', locator, maxsplit=1)[0]
            segment_path = '/' + '/'.join(part for part in locator.split('/') if part) + '/'
            if any(f'/{ex}/' in segment_path for ex in excludes):
                continue

            # Check if result relates to the company
            searchable = f'{result_title} {result_desc} {result_url}'.lower()
            # Count how many name tokens appear in the result (word boundary match)
            token_matches = sum(1 for token in name_tokens if token.search(searchable))
            if token_matches == 0:
                continue  # No connection to company at all

            # Extract handle using platform patterns; the first pattern that
            # matches decides, and too-short handles fall back to the raw URL.
            extracted_url = None
            for regex in handle_patterns:
                match = re.search(regex, result_url, re.IGNORECASE)
                if match:
                    handle = match.group(1)
                    if len(handle) >= 2:
                        extracted_url = self._build_social_url(platform, handle)
                    break

            candidates.append((token_matches, extracted_url or result_url))

        if candidates:
            # Best match first; max() keeps the earliest result on ties,
            # the same winner a stable descending sort would produce.
            best_score, best_url = max(candidates, key=lambda item: item[0])
            logger.info(f"Brave search matched {platform}: {best_url} (score: {best_score}/{len(name_tokens)})")
            return best_url

        logger.debug(f"No {platform} profile found in Brave results for: {query}")
        return None

    except requests.exceptions.Timeout:
        logger.warning(f"Timeout searching Brave for '{query}'")
        return None
    except requests.exceptions.RequestException as e:
        logger.warning(f"Brave API request failed for '{query}': {e}")
        return None
    except Exception as e:
        # Best-effort lookup: an unexpected response shape must not abort the audit.
        logger.warning(f"Error parsing Brave results for '{query}': {e}")
        return None
@staticmethod
def _build_social_url(platform: str, handle: str) -> str:
"""Build normalized social media URL from platform and handle."""
if platform == 'facebook':
if handle.isdigit():
return f'https://facebook.com/profile.php?id={handle}'
return f'https://facebook.com/{handle}'
elif platform == 'instagram':
handle = handle.split('?')[0].split('&')[0]
return f'https://instagram.com/{handle}'
elif platform == 'youtube':
if handle.startswith('@'):
return f'https://youtube.com/{handle}'
return f'https://youtube.com/channel/{handle}'
elif platform == 'linkedin':
return f'https://linkedin.com/{handle}'
elif platform == 'tiktok':
return f'https://tiktok.com/@{handle}'
elif platform == 'twitter':
return f'https://twitter.com/{handle}'
return handle
class SocialProfileEnricher:
@ -1212,8 +1335,11 @@ class SocialMediaAuditor:
# 2. Social media from website
website_social = result['website'].get('social_media_links', {})
social_sources = {} # Track source per platform
if website_social:
logger.info(f"Social media found on website: {list(website_social.keys())}")
for p in website_social:
social_sources[p] = 'website_scrape'
else:
logger.info("No social media links found on website")
@ -1230,12 +1356,14 @@ class SocialMediaAuditor:
for platform, url in brave_social.items():
if platform not in website_social:
website_social[platform] = url
social_sources[platform] = 'brave_search'
logger.info(f"Added {platform} from Brave search: {url}")
except Exception as e:
logger.warning(f"Brave search failed: {str(e)}")
result['errors'].append(f'Brave search failed: {str(e)}')
result['social_media'] = website_social
result['social_sources'] = social_sources
logger.info(f"Total social media profiles found: {len(website_social)} - {list(website_social.keys())}")
# OAuth: Try Facebook/Instagram Graph API for authenticated data
@ -1443,6 +1571,7 @@ class SocialMediaAuditor:
})
# Save social media with enriched data
social_sources = result.get('social_sources', {})
for platform, url in result.get('social_media', {}).items():
normalized_url = normalize_social_url(url, platform)
@ -1489,7 +1618,7 @@ class SocialMediaAuditor:
'platform': platform,
'url': normalized_url,
'verified_at': result['audit_date'],
'source': 'website_scrape',
'source': social_sources.get(platform, 'website_scrape'),
'is_valid': True,
'page_name': enriched.get('page_name'),
'followers_count': enriched.get('followers_count'),