feat: LinkedIn scraper retry with random delays + authwall detection
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Makes up to 3 attempts, with a random 2-5 s delay between retries. Detects
authwall pages and rate-limit responses (HTTP 429/999). Updates the
user-facing status message to explain that LinkedIn results can vary between scans.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-12 13:41:50 +01:00
parent 5505560445
commit 2c9a45230d
2 changed files with 45 additions and 22 deletions

View File

@ -992,7 +992,7 @@ def _run_enrichment_background(company_ids):
elif platform_name == 'instagram':
profile_result['reason'] = 'Instagram wymaga logowania. Podłącz Meta API (OAuth), aby pobierać dane.'
elif platform_name == 'linkedin':
profile_result['reason'] = 'LinkedIn blokuje dostęp publiczny dla botów.'
profile_result['reason'] = 'LinkedIn blokuje boty (3 próby z opóźnieniem). Wyniki mogą się różnić między skanami.'
else:
profile_result['reason'] = f'{profile.platform} — brak danych publicznych do pobrania.'

View File

@ -1201,17 +1201,32 @@ class SocialProfileEnricher:
return result
def _enrich_linkedin(self, url: str) -> Dict[str, Any]:
"""Enrich LinkedIn company page data."""
"""Enrich LinkedIn company page data.
LinkedIn aggressively blocks bots, so this method retries with random delays
to improve success rate. Returns empty dict if all attempts fail.
"""
import random
result = {}
max_retries = 3
for attempt in range(max_retries):
try:
if attempt > 0:
delay = random.uniform(2, 5)
time.sleep(delay)
resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
if resp.status_code == 200:
html = resp.text
# Check if LinkedIn returned a login wall instead of data
if 'authwall' in html[:2000].lower() or 'sign in' in html[:2000].lower():
logger.debug(f"LinkedIn authwall on attempt {attempt+1} for {url}")
continue
og_desc = re.search(r'<meta\s+(?:property|name)="og:description"\s+content="([^"]+)"', html)
if og_desc:
desc = og_desc.group(1).strip()
# LinkedIn descriptions often have follower count
followers_match = re.search(r'([\d,\.]+)\s+followers', desc, re.IGNORECASE)
followers_match = re.search(r'([\d,\.]+)\s+(?:followers|obserwujących)', desc, re.IGNORECASE)
if followers_match:
result['followers_count'] = self._parse_count(followers_match.group(1))
result['profile_description'] = desc[:500]
@ -1221,8 +1236,16 @@ class SocialProfileEnricher:
name_match = re.search(r'<meta\s+(?:property|name)="og:title"\s+content="([^"]+)"', html)
if name_match:
result['page_name'] = name_match.group(1)
if result:
break # Got data, no need to retry
elif resp.status_code in (429, 999):
logger.debug(f"LinkedIn rate-limited ({resp.status_code}) attempt {attempt+1} for {url}")
continue
else:
logger.debug(f"LinkedIn HTTP {resp.status_code} attempt {attempt+1} for {url}")
break # Non-retryable status
except Exception as e:
logger.debug(f"LinkedIn enrichment failed: {e}")
logger.debug(f"LinkedIn enrichment attempt {attempt+1} failed: {e}")
return result
def _enrich_tiktok(self, url: str) -> Dict[str, Any]: