From 513d32ffb255824a011df1fda0eb6f40871c3708 Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Sat, 28 Mar 2026 06:58:53 +0100 Subject: [PATCH] fix(nordagpt): catch bullet-point company hallucinations (* ProBud to...) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AI writes hallucinated company names at start of bullet points without any prefix word. New pattern catches "* CompanyName to/–/specjalizuje" and removes the fake name if it's not in the database. Co-Authored-By: Claude Opus 4.6 (1M context) --- nordabiz_chat.py | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/nordabiz_chat.py b/nordabiz_chat.py index 0bd8b90..6c726a4 100644 --- a/nordabiz_chat.py +++ b/nordabiz_chat.py @@ -245,24 +245,47 @@ class NordaBizChatEngine: text = re.sub(r'\*\*([^*]{2,40})\*\*', replace_bold_company, text) - # 4. Remove plain-text company name mentions that aren't linked - # Catches: "firma Baumar", "również Pro-Bud", "firmy Baumar i Pro-Bud" - def replace_plain_company(match): - prefix = match.group(1) # "firma", "również", etc. - name = match.group(2).strip().rstrip('.,;:') - if name.lower() in valid_names_set: - return match.group(0) # Valid company + # 4. Remove ALL plain-text company name mentions that aren't linked + # Catches: "firma Baumar", "również Pro-Bud", "* Pro-Bud to..." 
+ def check_company_name(name: str) -> bool: + """Check if a name is a valid company.""" + name_clean = name.strip().rstrip('.,;:') + if name_clean.lower() in valid_names_set: + return True for vn in valid_names_set: - if name.lower() in vn or vn in name.lower(): - return match.group(0) # Partial match + if name_clean.lower() in vn or vn in name_clean.lower(): + return True + return False + + def replace_plain_company(match): + prefix = match.group(1) + name = match.group(2).strip().rstrip('.,;:') + if check_company_name(name): + return match.group(0) logger.warning(f"NordaGPT hallucination blocked: plain text '{name}' after '{prefix}' not in DB") return '' + # Pattern 1: "firma X", "również X", "oraz X" text = re.sub( r'(firma|firmą|firmę|firmy|również|oraz)\s+([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)', replace_plain_company, text ) + # Pattern 2: "* CompanyName to/–/specjalizuje" at start of bullet point + def replace_bullet_company(match): + bullet = match.group(1) # "* " only: the pattern matches asterisk bullets, not "- " + name = match.group(2).strip() + suffix = match.group(3) # "to", "–", "specjalizuje" etc. + if check_company_name(name): + return match.group(0) + logger.warning(f"NordaGPT hallucination blocked: bullet company '{name}' not in DB") + return f'{bullet}{suffix}' # Keep bullet and suffix, remove company name + + text = re.sub( + r'(\*\s+)([A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]{2,25}(?:\s+[A-ZĄĘÓŁŹŻŚĆŃ][a-zA-ZąęółźżśćńĄĘÓŁŹŻŚĆŃ-]+)?)\s+(to |– |specjali|oferuj|zajmuj|zapewni|posiada|świadcz)', + replace_bullet_company, text + ) + # 5. Clean up artifacts left by removals text = re.sub(r':\s*oraz\s*to\b', ': to', text) # ": oraz to" → ": to" text = re.sub(r':\s*,', ':', text) # ": ," → ":"