fix(nordagpt): catch bullet-point company hallucinations (* ProBud to...)

AI writes hallucinated company names at the start of bullet points without any prefix word. The new pattern catches "* CompanyName to/–/specjalizuje" and removes the fake name if it is not in the database.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 06:58:53 +01:00 · 2026-03-28 06:58:53 +01:00 · 513d32ffb2
commit 513d32ffb2
parent a1a64730e3
1 changed files with 32 additions and 9 deletions
--- a/nordabiz_chat.py
+++ b/nordabiz_chat.py
@@ -245,24 +245,47 @@ class NordaBizChatEngine:
        text = re.sub(r'\*\*([^*]{2,40})\*\*', replace_bold_company, text)
-        # 4. Remove plain-text company name mentions that aren't linked
+        # 4. Remove ALL plain-text company name mentions that aren't linked
-        # Catches: "firma Baumar", "również Pro-Bud", "firmy Baumar i Pro-Bud"
+        # Catches: "firma Baumar", "również Pro-Bud", "* Pro-Bud to..."
-        def replace_plain_company(match):
+        def check_company_name(name: str) -> bool:
-            prefix = match.group(1)  # "firma", "również", etc.
+            """Check if a name is a valid company."""
-            name = match.group(2).strip().rstrip('.,;:')
+            name_clean = name.strip().rstrip('.,;:')
-            if name.lower() in valid_names_set:
+            if name_clean.lower() in valid_names_set:
-                return match.group(0)  # Valid company
+                return True
            for vn in valid_names_set:
-                if name.lower() in vn or vn in name.lower():
+                if name_clean.lower() in vn or vn in name_clean.lower():
-                    return match.group(0)  # Partial match
+                    return True
            return False
        def replace_plain_company(match):
            """Substitution callback for prefixed plain-text company mentions.

            Group 1 of ``match`` is the introducing word and group 2 is the
            candidate company name. Surrounding whitespace and trailing
            ``.,;:`` punctuation are stripped from the name before it is
            handed to ``check_company_name``.

            Returns the matched text unchanged when the name is accepted.
            Otherwise logs a warning and returns an empty string, which
            removes the whole match (introducing word included).
            """
            lead_word = match.group(1)
            candidate = match.group(2).strip().rstrip('.,;:')
            if not check_company_name(candidate):
                logger.warning(f"NordaGPT hallucination blocked: plain text '{candidate}' after '{lead_word}' not in DB")
                return ''
            # Accepted name: leave the original text exactly as it was.
            return match.group(0)
        # Pattern 1: "firma X", "również X", "oraz X"
        text = re.sub(
            r'(firma|firmą|firmę|firmy|również|oraz)\s+([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)',
            replace_plain_company, text
        )
        # Pattern 2: "* CompanyName to/–/specjalizuje" at start of bullet point
        def replace_bullet_company(match):
            """Substitution callback for a company name opening a bullet point.

            Group 1 of ``match`` is the bullet marker with its trailing
            whitespace, group 2 is the candidate company name and group 3 is
            the word (or dash) that follows the name.

            Returns the matched text unchanged when ``check_company_name``
            accepts the name. Otherwise logs a warning and returns only the
            bullet marker followed by the trailing word, dropping the name
            and the whitespace that separated it from that word.
            """
            marker, raw_name, tail = match.group(1, 2, 3)
            candidate = raw_name.strip()
            if not check_company_name(candidate):
                logger.warning(f"NordaGPT hallucination blocked: bullet company '{candidate}' not in DB")
                # Keep the bullet and what followed the name; drop the name itself.
                return f'{marker}{tail}'
            return match.group(0)
        text = re.sub(
            r'(\*\s+)([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)\s+(to |– |specjali|oferuj|zajmuj|zapewni|posiada|świadcz)',
            replace_bullet_company, text
        )
        # 5. Clean up artifacts left by removals
        text = re.sub(r':\s*oraz\s*to\b', ': to', text)     # ": oraz to" → ": to"
        text = re.sub(r':\s*,', ':', text)                    # ": ," → ":"