From 513d32ffb255824a011df1fda0eb6f40871c3708 Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Sat, 28 Mar 2026 06:58:53 +0100
Subject: [PATCH] fix(nordagpt): catch bullet-point company hallucinations (*
 ProBud to...)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The AI writes hallucinated company names at the start of bullet points
without any prefix word. The new pattern catches
"* CompanyName to/–/specjalizuje" and removes the fake name if it is not
in the database.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 nordabiz_chat.py | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

diff --git a/nordabiz_chat.py b/nordabiz_chat.py
index 0bd8b90..6c726a4 100644
--- a/nordabiz_chat.py
+++ b/nordabiz_chat.py
@@ -245,24 +245,47 @@ class NordaBizChatEngine:
 
         text = re.sub(r'\*\*([^*]{2,40})\*\*', replace_bold_company, text)
 
-        # 4. Remove plain-text company name mentions that aren't linked
-        # Catches: "firma Baumar", "również Pro-Bud", "firmy Baumar i Pro-Bud"
-        def replace_plain_company(match):
-            prefix = match.group(1)  # "firma", "również", etc.
-            name = match.group(2).strip().rstrip('.,;:')
-            if name.lower() in valid_names_set:
-                return match.group(0)  # Valid company
+        # 4. Remove ALL plain-text company name mentions that aren't linked
+        # Catches: "firma Baumar", "również Pro-Bud", "* Pro-Bud to..."
+        def check_company_name(name: str) -> bool:
+            """Check if a name is a valid company."""
+            name_clean = name.strip().rstrip('.,;:')
+            if name_clean.lower() in valid_names_set:
+                return True
             for vn in valid_names_set:
-                if name.lower() in vn or vn in name.lower():
-                    return match.group(0)  # Partial match
+                if name_clean.lower() in vn or vn in name_clean.lower():
+                    return True
+            return False
+
+        def replace_plain_company(match):
+            prefix = match.group(1)
+            name = match.group(2).strip().rstrip('.,;:')
+            if check_company_name(name):
+                return match.group(0)
             logger.warning(f"NordaGPT hallucination blocked: plain text '{name}' after '{prefix}' not in DB")
             return ''
 
+        # Pattern 1: "firma X", "również X", "oraz X"
         text = re.sub(
             r'(firma|firmą|firmę|firmy|również|oraz)\s+([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)',
             replace_plain_company, text
         )
 
+        # Pattern 2: "* CompanyName to/–/specjalizuje" at start of bullet point
+        def replace_bullet_company(match):
+            bullet = match.group(1)  # "*" plus trailing whitespace (dash bullets are not matched)
+            name = match.group(2).strip()
+            suffix = match.group(3)  # "to ", "– ", or a verb stem such as "specjali"
+            if check_company_name(name):
+                return match.group(0)
+            logger.warning(f"NordaGPT hallucination blocked: bullet company '{name}' not in DB")
+            return f'{bullet}{suffix}'  # Keep bullet and suffix, remove company name
+
+        text = re.sub(
+            r'(\*\s+)([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)\s+(to |– |specjali|oferuj|zajmuj|zapewni|posiada|świadcz)',
+            replace_bullet_company, text
+        )
+
         # 5. Clean up artifacts left by removals
         text = re.sub(r':\s*oraz\s*to\b', ': to', text)     # ": oraz to" → ": to"
         text = re.sub(r':\s*,', ':', text)                    # ": ," → ":"