From 18f9f98f5db057418fd705462022cecba8254b3b Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Mon, 9 Feb 2026 15:50:21 +0100
Subject: [PATCH] fix(zopk): Raise minimum scraped content threshold from 100
 to 500 chars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Articles with only 100-458 chars were passing validation but contained
only metadata/teasers instead of the full article text, so knowledge
extraction failed for all of them ("Treść za krótka do ekstrakcji").
The 500-char minimum is closer to the 200-token chunking requirement
(~800 chars).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 zopk_content_scraper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/zopk_content_scraper.py b/zopk_content_scraper.py
index 90d8fcd..bed4ea3 100644
--- a/zopk_content_scraper.py
+++ b/zopk_content_scraper.py
@@ -524,8 +524,8 @@ class ZOPKContentScraper:
             # Extract text
             text = self._extract_text(content_element)
 
-            if not text or len(text) < 100:
-                return None, "Treść artykułu za krótka"
+            if not text or len(text) < 500:
+                return None, f"Treść artykułu za krótka ({len(text) if text else 0} znaków, min. 500)"
 
             # Truncate if too long
             if len(text) > MAX_CONTENT_LENGTH: