From 2c9a45230d7a51cff159eae06fb94bac23390401 Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Thu, 12 Mar 2026 13:41:50 +0100 Subject: [PATCH] feat: LinkedIn scraper retry with random delays + authwall detection 3 attempts with 2-5s random delay between retries. Detects authwall and rate limit (429/999) responses. Updated status message to explain LinkedIn's inconsistent availability to users. Co-Authored-By: Claude Opus 4.6 --- blueprints/admin/routes_social.py | 2 +- scripts/social_media_audit.py | 65 +++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/blueprints/admin/routes_social.py b/blueprints/admin/routes_social.py index c662519..38c48c6 100644 --- a/blueprints/admin/routes_social.py +++ b/blueprints/admin/routes_social.py @@ -992,7 +992,7 @@ def _run_enrichment_background(company_ids): elif platform_name == 'instagram': profile_result['reason'] = 'Instagram wymaga logowania. Podłącz Meta API (OAuth), aby pobierać dane.' elif platform_name == 'linkedin': - profile_result['reason'] = 'LinkedIn blokuje dostęp publiczny dla botów.' + profile_result['reason'] = 'LinkedIn blokuje boty (3 próby z opóźnieniem). Wyniki mogą się różnić między skanami.' else: profile_result['reason'] = f'{profile.platform} — brak danych publicznych do pobrania.' diff --git a/scripts/social_media_audit.py b/scripts/social_media_audit.py index 0b4fc3e..7feaa3f 100644 --- a/scripts/social_media_audit.py +++ b/scripts/social_media_audit.py @@ -1201,28 +1201,51 @@ class SocialProfileEnricher: return result def _enrich_linkedin(self, url: str) -> Dict[str, Any]: - """Enrich LinkedIn company page data.""" + """Enrich LinkedIn company page data. + + LinkedIn aggressively blocks bots — retries with random delays + to improve success rate. Returns empty dict if all attempts fail. + """ + import random + result = {} - try: - resp = self.session.get(url, timeout=REQUEST_TIMEOUT) - if resp.status_code == 200: - html = resp.text - og_desc = re.search(r' 0: + delay = random.uniform(2, 5) + time.sleep(delay) + resp = self.session.get(url, timeout=REQUEST_TIMEOUT) + if resp.status_code == 200: + html = resp.text + # Check if LinkedIn returned a login wall instead of data + if 'authwall' in html[:2000].lower() or 'sign in' in html[:2000].lower(): + logger.debug(f"LinkedIn authwall on attempt {attempt+1} for {url}") + continue + og_desc = re.search(r' Dict[str, Any]: