fix(social-audit): Fix Facebook URL truncation and improve scraping patterns
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- Add regex pattern for Facebook /p/PageName-ID/ multi-segment URLs
- Add 'p' (bare /p is always truncated), 'people', 'hashtag', 'help', 'settings' and 'notifications' to the Facebook exclusion list
- Add minimum length validation for extracted social handles
- Strip Instagram tracking params (?igsh=, &utm_source=) from handles

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-06 19:29:06 +01:00
parent 2e6eca55e7
commit 60d28a5c24

View File

@ -116,6 +116,8 @@ HOSTING_PROVIDERS = {
SOCIAL_MEDIA_PATTERNS = {
'facebook': [
r'(?:https?://)?(?:www\.)?facebook\.com/profile\.php\?id=(\d+)',
# Multi-segment paths like /p/PageName-12345/ - capture full path
r'(?:https?://)?(?:www\.)?facebook\.com/(p/[^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.)?facebook\.com/([^/?\s"\'<>]+)',
r'(?:https?://)?(?:www\.)?fb\.com/([^/?\s"\'<>]+)',
],
@ -140,7 +142,7 @@ SOCIAL_MEDIA_PATTERNS = {
# False positives to exclude
SOCIAL_MEDIA_EXCLUDE = {
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories'],
'facebook': ['sharer', 'share', 'intent', 'plugins', 'dialog', 'sharer.php', 'login', 'pages', 'boldthemes', 'profile.php', 'profile', 'watch', 'groups', 'events', 'marketplace', 'gaming', 'stories', 'p', 'people', 'hashtag', 'help', 'settings', 'notifications'],
'instagram': ['explore', 'accounts', 'p', 'reel'],
'youtube': ['embed', 'watch', 'playlist', 'results', 'feed', 'channel', 'c', 'user', '@', 'about', 'featured', 'videos', 'shorts', 'streams', 'playlists', 'community', 'channels', 'store'],
'linkedin': ['shareArticle', 'share', 'login'],
@ -448,6 +450,9 @@ class WebsiteAuditor:
if matches:
# Get first valid match, excluding common false positives
for match in matches:
# Skip very short matches (likely truncated or generic paths)
if len(match) < 2:
continue
# Check against exclusion list (exact match only to avoid false positives)
excludes = SOCIAL_MEDIA_EXCLUDE.get(platform, [])
if match.lower() not in excludes:
@ -455,9 +460,17 @@ class WebsiteAuditor:
if platform == 'facebook':
if match.isdigit():
url = f'https://facebook.com/profile.php?id={match}'
elif '/' in match:
# Multi-segment path (e.g. p/PageName-123)
url = f'https://facebook.com/{match}'
else:
url = f'https://facebook.com/{match}'
elif platform == 'instagram':
# Strip tracking params (igsh=, utm_) from Instagram handles; skip the match if fewer than 2 characters remain
if '?' in match or '&' in match:
match = match.split('?')[0].split('&')[0]
if len(match) < 2:
continue
url = f'https://instagram.com/{match}'
elif platform == 'youtube':
if match.startswith('@'):