diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py
index e31b4fe..9b464c9 100644
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@@ -140,12 +140,19 @@ ELEMENTS_TO_REMOVE = [
 
 # Domains that are not scrapeable (paywalls, dynamic content, etc.)
 SKIP_DOMAINS = [
+    # Social media
     'facebook.com',
     'twitter.com',
     'x.com',
     'linkedin.com',
     'youtube.com',
     'instagram.com',
+    # Paywalled news sites (require login, return cookie dialogs)
+    'wyborcza.pl',  # Gazeta Wyborcza paywall
+    'rp.pl',  # Rzeczpospolita paywall
+    # Aggregators (no original content)
+    'wykop.pl',  # Social news aggregator
+    'reddit.com',
 ]
 
 