Fix YouTube detection overwriting valid matches
- Add 'channel', 'c', 'user', '@' etc. to YouTube exclusion list
- Add 'boldthemes' to Facebook exclusions and 'bold_themes', 'boldthemes' to Twitter exclusions (theme creators)
- Fix pattern matching loop to stop after first valid match per platform
- Prevent fallback pattern from overwriting correct channel ID with 'channel'

Fixes issue where youtube.com/channel/ID was being overwritten with youtube.com/channel/channel by the second fallback pattern.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
c319777d58
commit
39cd257f4e
@ -117,12 +117,12 @@ SOCIAL_MEDIA_PATTERNS = {
|
|||||||
|
|
||||||
# False positives to exclude
|
# False positives to exclude
|
||||||
SOCIAL_MEDIA_EXCLUDE = {
|
SOCIAL_MEDIA_EXCLUDE = {
|
||||||
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages'],
|
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes'],
|
||||||
'instagram': ['explore', 'accounts', 'p', 'reel'],
|
'instagram': ['explore', 'accounts', 'p', 'reel'],
|
||||||
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed'],
|
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
|
||||||
'linkedin': ['shareArticle', 'share', 'login'],
|
'linkedin': ['shareArticle', 'share', 'login'],
|
||||||
'tiktok': ['embed', 'video'],
|
'tiktok': ['embed', 'video'],
|
||||||
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com'],
|
'twitter': ['intent', 'share', 'widgets.js', 'widgets', 'tweet', 'platform.twitter.com', 'bold_themes', 'boldthemes'],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -417,7 +417,10 @@ class WebsiteAuditor:
|
|||||||
# Extract social media links
|
# Extract social media links
|
||||||
html_lower = html.lower()
|
html_lower = html.lower()
|
||||||
for platform, patterns in SOCIAL_MEDIA_PATTERNS.items():
|
for platform, patterns in SOCIAL_MEDIA_PATTERNS.items():
|
||||||
|
found_for_platform = False
|
||||||
for pattern in patterns:
|
for pattern in patterns:
|
||||||
|
if found_for_platform:
|
||||||
|
break # Already found this platform, skip remaining patterns
|
||||||
matches = re.findall(pattern, html, re.IGNORECASE)
|
matches = re.findall(pattern, html, re.IGNORECASE)
|
||||||
if matches:
|
if matches:
|
||||||
# Get first valid match, excluding common false positives
|
# Get first valid match, excluding common false positives
|
||||||
@ -445,7 +448,8 @@ class WebsiteAuditor:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
result['social_media_links'][platform] = url
|
result['social_media_links'][platform] = url
|
||||||
break
|
found_for_platform = True
|
||||||
|
break # Found valid match, stop searching this pattern's matches
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
result['errors'] = [f'HTML parsing: {str(e)}']
|
result['errors'] = [f'HTML parsing: {str(e)}']
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user