fix(social-audit): Fix Facebook URL truncation and improve scraping patterns
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- Add regex pattern for Facebook /p/PageName-ID/ multi-segment URLs
- Add 'p' to Facebook exclusion list (bare /p is always truncated)
- Add minimum length validation for extracted social handles
- Strip Instagram tracking params (?igsh=, &utm_source=) from handles

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e6eca55e7
commit
60d28a5c24
@ -116,6 +116,8 @@ HOSTING_PROVIDERS = {
|
||||
SOCIAL_MEDIA_PATTERNS = {
|
||||
'facebook': [
|
||||
r'(?:https?://)?(?:www\.)?facebook\.com/profile\.php\?id=(\d+)',
|
||||
# Multi-segment paths like /p/PageName-12345/ - capture full path
|
||||
r'(?:https?://)?(?:www\.)?facebook\.com/(p/[^/?\s"\'<>]+)',
|
||||
r'(?:https?://)?(?:www\.)?facebook\.com/([^/?\s"\'<>]+)',
|
||||
r'(?:https?://)?(?:www\.)?fb\.com/([^/?\s"\'<>]+)',
|
||||
],
|
||||
@ -140,7 +142,7 @@ SOCIAL_MEDIA_PATTERNS = {
|
||||
|
||||
# False positives to exclude
|
||||
SOCIAL_MEDIA_EXCLUDE = {
|
||||
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories'],
|
||||
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories', 'p', 'people', 'hashtag', 'help', 'settings', 'notifications'],
|
||||
'instagram': ['explore', 'accounts', 'p', 'reel'],
|
||||
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
|
||||
'linkedin': ['shareArticle', 'share', 'login'],
|
||||
@ -448,6 +450,9 @@ class WebsiteAuditor:
|
||||
if matches:
|
||||
# Get first valid match, excluding common false positives
|
||||
for match in matches:
|
||||
# Skip very short matches (likely truncated or generic paths)
|
||||
if len(match) < 2:
|
||||
continue
|
||||
# Check against exclusion list (exact match only to avoid false positives)
|
||||
excludes = SOCIAL_MEDIA_EXCLUDE.get(platform, [])
|
||||
if match.lower() not in excludes:
|
||||
@ -455,9 +460,17 @@ class WebsiteAuditor:
|
||||
if platform == 'facebook':
|
||||
if match.isdigit():
|
||||
url = f'https://facebook.com/profile.php?id={match}'
|
||||
elif '/' in match:
|
||||
# Multi-segment path (e.g. p/PageName-123)
|
||||
url = f'https://facebook.com/{match}'
|
||||
else:
|
||||
url = f'https://facebook.com/{match}'
|
||||
elif platform == 'instagram':
|
||||
# Strip tracking params (igsh=, utm_) from Instagram handles; skip the handle only if fewer than 2 characters remain
|
||||
if '?' in match or '&' in match:
|
||||
match = match.split('?')[0].split('&')[0]
|
||||
if len(match) < 2:
|
||||
continue
|
||||
url = f'https://instagram.com/{match}'
|
||||
elif platform == 'youtube':
|
||||
if match.startswith('@'):
|
||||
|
||||
Loading…
Reference in New Issue
Block a user