fix: Implement Brave Search for LinkedIn detection and fix URL construction

- Replace placeholder _search_brave() with real Brave API integration
- Fix LinkedIn URL construction: /in/ profiles were incorrectly built as /company/
- Add word-boundary matching to validate search results against company name
- Track source (website_scrape vs brave_search) per platform in audit results
- Increase search results from 5 to 10 for better coverage

Fixes: WATERM LinkedIn profile not detected (website has no LinkedIn link,
but Brave Search finds the personal /in/ profile)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-17 10:52:09 +01:00 · 2026-02-17 10:52:09 +01:00 · 6633b94644
commit 6633b94644
parent d3b59b824e
1 changed files with 141 additions and 12 deletions
--- a/scripts/social_media_audit.py
+++ b/scripts/social_media_audit.py
@ -129,8 +129,8 @@ SOCIAL_MEDIA_PATTERNS = {
        r'(?:https?://)?(?:www\.)?youtube\.com/([^/?\s"\'<>]+)',
    ],
    'linkedin': [
-        r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/company/([^/?\s"\'<>]+)',
-        r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/in/([^/?\s"\'<>]+)',
+        r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(company/[^/?\s"\'<>]+)',
+        r'(?:https?://)?(?:www\.|pl\.)?linkedin\.com/(in/[^/?\s"\'<>]+)',
    ],
    'tiktok': [
        r'(?:https?://)?(?:www\.)?tiktok\.com/@([^/?\s"\'<>]+)',
@ -145,7 +145,7 @@ SOCIAL_MEDIA_EXCLUDE = {
    'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories', 'p', 'people', 'hashtag', 'help', 'settings', 'notifications', 'tr', 'privacy', 'policies', 'ads', 'business', 'legal', 'flx'],
    'instagram': ['explore', 'accounts', 'p', 'reel'],
    'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
-    'linkedin': ['shareArticle', 'share', 'login'],
+    'linkedin': ['company/shareArticle', 'company/share', 'company/login', 'in/shareArticle', 'in/share', 'in/login'],
    'tiktok': ['embed', 'video'],
    'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com', 'bold_themes', 'boldthemes'],
 }
@ -478,7 +478,7 @@ class WebsiteAuditor:
                                    else:
                                        url = f'https://youtube.com/channel/{match}'
                                elif platform == 'linkedin':
-                                    url = f'https://linkedin.com/company/{match}'
+                                    url = f'https://linkedin.com/{match}'
                                elif platform == 'tiktok':
                                    url = f'https://tiktok.com/@{match}'
                                elif platform == 'twitter':
@ -729,7 +729,7 @@ class BraveSearcher:

        for platform, query in platforms:
            try:
-                url = self._search_brave(query, platform)
+                url = self._search_brave(query, platform, company_name)
                if url:
                    results[platform] = url
                time.sleep(0.5)  # Rate limiting
@ -884,14 +884,137 @@ class BraveSearcher:
            logger.warning(f"Error parsing Brave results for '{company_name}': {e}")
            return None

-    def _search_brave(self, query: str, platform: str) -> Optional[str]:
+    def _search_brave(self, query: str, platform: str, company_name: str = '') -> Optional[str]:
        """
-        Perform Brave search and extract relevant URL.
-        Note: This is a placeholder - actual implementation would use Brave API.
+        Perform Brave search and extract relevant social media URL.
+        Validates results against company_name to avoid false matches.
+        Returns normalized URL for the platform or None.
+
+        Args:
+            query: Search phrase sent to the Brave web search API.
+            platform: Platform key ('facebook', 'instagram', 'youtube',
+                'linkedin', 'tiktok' or 'twitter').
+            company_name: Company name; at least one of its words of three or
+                more characters must occur (as a whole word) in a result's
+                title, description or URL for that result to be accepted.
+
+        Returns:
+            URL of the result with the most matching name words (the earliest
+            such result wins ties), or None when no API key is configured, the
+            platform is unknown, the name yields no usable words, nothing
+            matches, or the request/parsing fails (failures are logged).
        """
-        # Placeholder for Brave Search API integration
-        # In production, this would call the Brave Search API
-        return None
+        if not self.api_key:
+            logger.debug(f"No Brave API key - skipping search for {platform}")
+            return None
+
+        try:
+            # Platform domain patterns
+            domain_patterns = {
+                'facebook': r'facebook\.com/',
+                'instagram': r'instagram\.com/',
+                'youtube': r'youtube\.com/',
+                'linkedin': r'linkedin\.com/(?:company|in)/',
+                'tiktok': r'tiktok\.com/@',
+                'twitter': r'(?:twitter|x)\.com/',
+            }
+
+            # Check the cheap preconditions before spending an API request.
+            pattern = domain_patterns.get(platform)
+            if not pattern:
+                return None
+
+            # Generate matching tokens with word boundary patterns
+            # (e.g. "Waterm Artur Wiertel" -> [r'\bwaterm\b', r'\bartur\b', r'\bwiertel\b'])
+            name_tokens = [re.compile(r'\b' + re.escape(t) + r'\b', re.IGNORECASE)
+                           for t in company_name.lower().split() if len(t) >= 3]
+            if not name_tokens:
+                # Every result must match at least one token, so no result
+                # could be accepted - skip the request altogether.
+                logger.debug(f"No usable name tokens for {platform} search: {query}")
+                return None
+
+            # Excluded names are compared with the extracted handle, not searched
+            # for inside the whole URL: a substring test would reject every
+            # result, because one-letter entries such as 'p' and 'c' occur in
+            # "https://" and ".com".
+            excluded_handles = {ex.lower() for ex in SOCIAL_MEDIA_EXCLUDE.get(platform, [])}
+
+            url = 'https://api.search.brave.com/res/v1/web/search'
+            headers = {
+                'Accept': 'application/json',
+                'Accept-Encoding': 'gzip',
+                'X-Subscription-Token': self.api_key,
+            }
+            params = {
+                'q': query,
+                'count': 10,
+                'country': 'pl',
+                'search_lang': 'pl',
+                'ui_lang': 'pl-PL',
+            }
+
+            response = self.session.get(url, headers=headers, params=params, timeout=REQUEST_TIMEOUT)
+            response.raise_for_status()
+
+            data = response.json()
+            results = data.get('web', {}).get('results', [])
+
+            candidates = []
+            for result in results:
+                result_url = result.get('url', '')
+                result_title = result.get('title', '')
+                result_desc = result.get('description', '')
+
+                if not re.search(pattern, result_url, re.IGNORECASE):
+                    continue
+
+                # Check if result relates to the company
+                searchable = f'{result_title} {result_desc} {result_url}'.lower()
+                # Count how many name tokens appear in the result (word boundary match)
+                token_matches = sum(1 for t in name_tokens if t.search(searchable))
+
+                if token_matches == 0:
+                    continue  # No connection to company at all
+
+                # Extract handle using platform patterns
+                extracted_url = None
+                hit_excluded = False
+                for regex in SOCIAL_MEDIA_PATTERNS.get(platform, []):
+                    match = re.search(regex, result_url, re.IGNORECASE)
+                    if not match:
+                        continue
+                    handle = match.group(1)
+                    if handle.lower() in excluded_handles:
+                        # Share/login/post page rather than a profile; a more
+                        # specific pattern may still extract a real handle.
+                        hit_excluded = True
+                        continue
+                    if len(handle) >= 2:
+                        extracted_url = self._build_social_url(platform, handle)
+                        break
+
+                if not extracted_url:
+                    if hit_excluded:
+                        continue  # Only excluded handles matched - not a profile
+                    # No pattern applied; keep the URL exactly as Brave returned it
+                    extracted_url = result_url
+
+                candidates.append((token_matches, extracted_url))
+
+            if candidates:
+                # Best match = most token matches; max() keeps the earliest
+                # (highest-ranked) result when scores are equal.
+                best_score, best_url = max(candidates, key=lambda x: x[0])
+                logger.info(f"Brave search matched {platform}: {best_url} (score: {best_score}/{len(name_tokens)})")
+                return best_url
+
+            logger.debug(f"No {platform} profile found in Brave results for: {query}")
+            return None
+
+        except requests.exceptions.Timeout:
+            logger.warning(f"Timeout searching Brave for '{query}'")
+            return None
+        except requests.exceptions.RequestException as e:
+            logger.warning(f"Brave API request failed for '{query}': {e}")
+            return None
+        except Exception as e:
+            logger.warning(f"Error parsing Brave results for '{query}': {e}")
+            return None
+
+    @staticmethod
+    def _build_social_url(platform: str, handle: str) -> str:
+        """Return the canonical profile URL for ``handle`` on ``platform``.
+
+        Args:
+            platform: Platform key ('facebook', 'instagram', 'youtube',
+                'linkedin', 'tiktok' or 'twitter').
+            handle: Profile identifier to place in the URL.
+
+        Returns:
+            The ``https://`` profile URL, or ``handle`` unchanged when the
+            platform is not one of the known keys.
+        """
+        if platform == 'facebook':
+            # All-digit handles are addressed through profile.php?id=
+            return (f'https://facebook.com/profile.php?id={handle}' if handle.isdigit()
+                    else f'https://facebook.com/{handle}')
+        if platform == 'instagram':
+            # Keep only the part before the first '?' or '&'
+            bare_handle = handle.split('?')[0].split('&')[0]
+            return f'https://instagram.com/{bare_handle}'
+        if platform == 'youtube':
+            # '@name' handles sit at the site root; anything else goes under /channel/
+            return (f'https://youtube.com/{handle}' if handle.startswith('@')
+                    else f'https://youtube.com/channel/{handle}')
+        if platform == 'linkedin':
+            return f'https://linkedin.com/{handle}'
+        if platform == 'tiktok':
+            return f'https://tiktok.com/@{handle}'
+        if platform == 'twitter':
+            return f'https://twitter.com/{handle}'
+        return handle


 class SocialProfileEnricher:
@ -1212,8 +1335,11 @@ class SocialMediaAuditor:

        # 2. Social media from website
        website_social = result['website'].get('social_media_links', {})
+        social_sources = {}  # Track source per platform
        if website_social:
            logger.info(f"Social media found on website: {list(website_social.keys())}")
+            for p in website_social:
+                social_sources[p] = 'website_scrape'
        else:
            logger.info("No social media links found on website")

@ -1230,12 +1356,14 @@ class SocialMediaAuditor:
            for platform, url in brave_social.items():
                if platform not in website_social:
                    website_social[platform] = url
+                    social_sources[platform] = 'brave_search'
                    logger.info(f"Added {platform} from Brave search: {url}")
        except Exception as e:
            logger.warning(f"Brave search failed: {str(e)}")
            result['errors'].append(f'Brave search failed: {str(e)}')

        result['social_media'] = website_social
+        result['social_sources'] = social_sources
        logger.info(f"Total social media profiles found: {len(website_social)} - {list(website_social.keys())}")

        # OAuth: Try Facebook/Instagram Graph API for authenticated data
@ -1443,6 +1571,7 @@ class SocialMediaAuditor:
                })

                # Save social media with enriched data
+                social_sources = result.get('social_sources', {})
                for platform, url in result.get('social_media', {}).items():
                    normalized_url = normalize_social_url(url, platform)

@ -1489,7 +1618,7 @@ class SocialMediaAuditor:
                        'platform': platform,
                        'url': normalized_url,
                        'verified_at': result['audit_date'],
-                        'source': 'website_scrape',
+                        'source': social_sources.get(platform, 'website_scrape'),
                        'is_valid': True,
                        'page_name': enriched.get('page_name'),
                        'followers_count': enriched.get('followers_count'),