From 18f9f98f5db057418fd705462022cecba8254b3b Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Mon, 9 Feb 2026 15:50:21 +0100 Subject: [PATCH] fix(zopk): Raise minimum scraped content threshold from 100 to 500 chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Articles with only 100-458 chars were passing validation but contained metadata/teasers instead of full article text, causing all knowledge extraction to fail ("Treść za krótka do ekstrakcji"). The 500-char minimum better aligns with the 200-token chunking requirement (~800 chars). Co-Authored-By: Claude Opus 4.6 --- zopk_content_scraper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py index 90d8fcd..bed4ea3 100644 --- a/zopk_content_scraper.py +++ b/zopk_content_scraper.py @@ -524,8 +524,8 @@ class ZOPKContentScraper: # Extract text text = self._extract_text(content_element) - if not text or len(text) < 100: - return None, "Treść artykułu za krótka" + if not text or len(text) < 500: + return None, f"Treść artykułu za krótka ({len(text) if text else 0} znaków, min. 500)" # Truncate if too long if len(text) > MAX_CONTENT_LENGTH: