Fix YouTube detection overwriting valid matches

- Add 'channel', 'c', 'user', '@' etc. to YouTube exclusion list
- Add 'boldthemes' to Facebook exclusions and 'bold_themes', 'boldthemes' to Twitter exclusions (theme creators)
- Fix pattern matching loop to stop after first valid match per platform
- Prevents fallback pattern from overwriting correct channel ID with 'channel'

Fixes an issue where youtube.com/channel/ID was being overwritten with
youtube.com/channel/channel by the second fallback pattern.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-09 05:36:06 +01:00
parent c319777d58
commit 39cd257f4e

View File

@@ -117,12 +117,12 @@ SOCIAL_MEDIA_PATTERNS = {
# False positives to exclude
SOCIAL_MEDIA_EXCLUDE = {
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages'],
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes'],
'instagram': ['explore', 'accounts', 'p', 'reel'],
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed'],
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
'linkedin': ['shareArticle', 'share', 'login'],
'tiktok': ['embed', 'video'],
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com'],
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com', 'bold_themes', 'boldthemes'],
}
@@ -417,7 +417,10 @@ class WebsiteAuditor:
# Extract social media links
html_lower = html.lower()
for platform, patterns in SOCIAL_MEDIA_PATTERNS.items():
found_for_platform = False
for pattern in patterns:
if found_for_platform:
break # Already found this platform, skip remaining patterns
matches = re.findall(pattern, html, re.IGNORECASE)
if matches:
# Get first valid match, excluding common false positives
@@ -445,7 +448,8 @@ class WebsiteAuditor:
continue
result['social_media_links'][platform] = url
break
found_for_platform = True
break # Found valid match, stop searching this pattern's matches
except Exception as e:
result['errors'] = [f'HTML parsing: {str(e)}']