fix(nordagpt): smarter company validator — fix slugs instead of removing real companies
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- AI generates "inpi-sp-z-o-o" but real slug is "inpi" → now auto-corrected
- Fuzzy prefix matching on slugs (handles legal form suffixes)
- Name-based resolution as fallback (match link text to company name)
- Hallucinated companies: keep text, remove link (instead of deleting entirely)
- Better cleanup of artifacts ("oraz –", empty bullets)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-28 06:30:20 +01:00
parent c167794bb6
commit 464e456939

View File

@ -145,43 +145,79 @@ class NordaBizChatEngine:
finally:
db.close()
@staticmethod
def _find_correct_slug(attempted_slug: str, valid_slugs: set, name_to_slug: dict) -> Optional[str]:
    """Try to map a slug produced by the AI onto a slug that really exists.

    Resolution order:
      1. Exact match.
      2. A valid slug that is a hyphen-delimited prefix of the attempted one
         (the AI often appends the legal form: "inpi-sp-z-o-o" for "inpi").
         When several valid slugs qualify, the longest (most specific) wins.
      3. A valid slug that starts with the attempted one. Matches on a hyphen
         boundary are preferred over raw prefix matches, and a result is only
         returned when it is unambiguous.

    Args:
        attempted_slug: Slug taken from the AI-generated link.
        valid_slugs: Set of slugs known to exist.
        name_to_slug: Lowercase company name -> slug mapping. Not used by
            this helper at the moment; kept so the signature stays stable.

    Returns:
        The matching valid slug, or None when nothing matches or the match
        would be ambiguous.
    """
    # An empty string is a prefix of everything; never "correct" it.
    if not attempted_slug:
        return None

    if attempted_slug in valid_slugs:
        return attempted_slug

    # Valid slug + extra suffix. Two distinct prefixes of the same string
    # cannot have equal length, so max() by length is deterministic even
    # though set iteration order is not.
    shorter = [
        vs for vs in valid_slugs
        if vs and attempted_slug.startswith(vs + '-')
    ]
    if shorter:
        return max(shorter, key=len)

    # Attempted slug is a truncated form of a valid slug. Guessing between
    # several candidates would link the text to an arbitrary company, so a
    # tier is accepted only when it contains exactly one candidate.
    longer = sorted(vs for vs in valid_slugs if vs.startswith(attempted_slug))
    on_boundary = [vs for vs in longer if vs.startswith(attempted_slug + '-')]
    for candidates in (on_boundary, longer):
        if len(candidates) == 1:
            return candidates[0]

    return None
@staticmethod
def _validate_company_references(text: str) -> str:
"""
Post-process AI response: remove links to companies that don't exist in DB.
Post-process AI response: fix or remove links to companies that don't exist in DB.
This is the ONLY reliable way to prevent hallucinated company names.
"""
import re
valid_companies = NordaBizChatEngine._get_valid_company_slugs()
valid_slugs = set(valid_companies.keys())
valid_names_lower = {name.lower(): name for name in valid_companies.values()}
# Map: lowercase name → slug
name_to_slug = {}
for slug, name in valid_companies.items():
name_to_slug[name.lower()] = slug
# 1. Validate markdown links to /firma/slug — remove if slug doesn't exist
# 1. Validate markdown links to /firma/slug — fix or remove
def replace_link(match):
link_text = match.group(1)
slug = match.group(2)
# Try exact match first
if slug in valid_slugs:
return match.group(0) # Keep valid link
else:
logger.warning(f"NordaGPT hallucination blocked: removed link to non-existent company slug '{slug}' (text: '{link_text}')")
return '' # Remove entire link
return match.group(0)
# Try fuzzy slug match (AI often adds legal suffix)
correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
if correct_slug:
logger.info(f"NordaGPT slug corrected: '{slug}''{correct_slug}'")
return f'[{link_text}](/firma/{correct_slug})'
# Try matching by link text (company name)
name_slug = name_to_slug.get(link_text.lower().strip())
if name_slug:
logger.info(f"NordaGPT slug resolved by name: '{link_text}''{name_slug}'")
return f'[{link_text}](/firma/{name_slug})'
# No match — hallucination, keep just the text without link
logger.warning(f"NordaGPT hallucination blocked: '{link_text}' (slug: '{slug}') not in DB")
return link_text # Keep text, remove link
text = re.sub(r'\[([^\]]+)\]\(/firma/([a-z0-9-]+)\)', replace_link, text)
# 2. Validate pill-style links that the frontend renders
# 2. Validate pill-style links
def replace_pill_link(match):
full_match = match.group(0)
slug = match.group(1)
link_text = match.group(2) if match.lastindex >= 2 else slug
if slug in valid_slugs:
return full_match
else:
logger.warning(f"NordaGPT hallucination blocked: removed pill link to '{slug}'")
return ''
correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
if correct_slug:
return full_match.replace(f'/firma/{slug}', f'/firma/{correct_slug}')
logger.warning(f"NordaGPT hallucination blocked: pill link '{slug}' not in DB")
return link_text
text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>.*?</a>', replace_pill_link, text)
text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>(.*?)</a>', replace_pill_link, text)
# 3. Clean up empty list items and double spaces left by removals
# 3. Clean up artifacts left by removals
text = re.sub(r'\*\s*\s*\n', '\n', text) # "* " (bullet with removed company)
text = re.sub(r'\*\s*oraz\s*', '*', text) # "* oraz " fragments
text = re.sub(r'\n\s*\*\s*\n', '\n', text) # empty bullet points
text = re.sub(r'\n\s*-\s*\n', '\n', text) # empty list items
text = re.sub(r' +', ' ', text) # double spaces