fix(scraper): Dodano domeny paywall do SKIP_DOMAINS
- wyborcza.pl - paywall Gazety Wyborczej
- rp.pl - paywall Rzeczpospolitej
- wykop.pl - agregator bez oryginalnej treści
- reddit.com - agregator

Te domeny zwracają cookie dialog zamiast treści artykułów.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
21586f6b91
commit
1e42c4fbd8
@@ -140,12 +140,19 @@ ELEMENTS_TO_REMOVE = [
|
||||
|
||||
# Domains that are not scrapeable (paywalls, dynamic content, etc.)
|
||||
SKIP_DOMAINS = [
|
||||
# Social media
|
||||
'facebook.com',
|
||||
'twitter.com',
|
||||
'x.com',
|
||||
'linkedin.com',
|
||||
'youtube.com',
|
||||
'instagram.com',
|
||||
# Paywalled news sites (require login, return cookie dialogs)
|
||||
'wyborcza.pl', # Gazeta Wyborcza paywall
|
||||
'rp.pl', # Rzeczpospolita paywall
|
||||
# Aggregators (no original content)
|
||||
'wykop.pl', # Social news aggregator
|
||||
'reddit.com',
|
||||
]
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user