fix(nordagpt): smarter company validator — fix slugs instead of removing real companies
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- AI generates "inpi-sp-z-o-o" but real slug is "inpi" → now auto-corrected
- Fuzzy prefix matching on slugs (handles legal form suffixes)
- Name-based resolution as fallback (match link text to company name)
- Hallucinated companies: keep text, remove link (instead of deleting entirely)
- Better cleanup of artifacts ("oraz –", empty bullets)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c167794bb6
commit
464e456939
@ -145,43 +145,79 @@ class NordaBizChatEngine:
|
||||
finally:
|
||||
db.close()
|
||||
|
||||
@staticmethod
def _find_correct_slug(attempted_slug: str, valid_slugs: set, name_to_slug: dict) -> Optional[str]:
    """Resolve a possibly hallucinated company slug to a known one.

    Resolution order:

    1. Exact membership in ``valid_slugs``.
    2. A valid slug followed by ``-`` is a prefix of ``attempted_slug``
       (the AI often appends the legal form: "inpi-sp-z-o-o" instead of
       "inpi"). When several valid slugs qualify, the longest (most
       specific) one wins.
    3. ``attempted_slug`` is a prefix of a valid slug. Candidates that
       continue with ``-`` are preferred over loose prefix matches, then
       shorter slugs, then alphabetical order.

    Args:
        attempted_slug: Slug taken from the generated link.
        valid_slugs: Collection of slugs that really exist.
        name_to_slug: Lowercased company name -> slug mapping. Not consulted
            by this helper; kept so the call signature stays unchanged.

    Returns:
        The matching valid slug, or ``None`` when nothing plausible matches.
    """
    # An empty slug is a prefix of every string; without this guard step 3
    # would hand back an arbitrary company.
    if not attempted_slug:
        return None

    # 1. Direct match
    if attempted_slug in valid_slugs:
        return attempted_slug

    # 2. Valid slug + '-' is a prefix of the attempted slug.
    # Two different prefixes of the same string never share a length, so
    # picking by length is unambiguous and independent of set ordering.
    shorter_candidates = [
        vs for vs in valid_slugs if attempted_slug.startswith(vs + '-')
    ]
    if shorter_candidates:
        return max(shorter_candidates, key=len)

    # 3. Attempted slug is a prefix of a valid slug.
    longer_candidates = [
        vs for vs in valid_slugs if vs.startswith(attempted_slug)
    ]
    if not longer_candidates:
        return None

    boundary = attempted_slug + '-'
    # Sort key: hyphen-boundary matches first (False < True), then the
    # closest length, then alphabetical so ties resolve deterministically.
    return min(
        longer_candidates,
        key=lambda vs: (not vs.startswith(boundary), len(vs), vs),
    )
|
||||
|
||||
@staticmethod
|
||||
def _validate_company_references(text: str) -> str:
|
||||
"""
|
||||
Post-process AI response: remove links to companies that don't exist in DB.
|
||||
Post-process AI response: fix or remove links to companies that don't exist in DB.
|
||||
This is the ONLY reliable way to prevent hallucinated company names.
|
||||
"""
|
||||
import re
|
||||
|
||||
valid_companies = NordaBizChatEngine._get_valid_company_slugs()
|
||||
valid_slugs = set(valid_companies.keys())
|
||||
valid_names_lower = {name.lower(): name for name in valid_companies.values()}
|
||||
# Map: lowercase name → slug
|
||||
name_to_slug = {}
|
||||
for slug, name in valid_companies.items():
|
||||
name_to_slug[name.lower()] = slug
|
||||
|
||||
# 1. Validate markdown links to /firma/slug — remove if slug doesn't exist
|
||||
# 1. Validate markdown links to /firma/slug — fix or remove
|
||||
def replace_link(match):
|
||||
link_text = match.group(1)
|
||||
slug = match.group(2)
|
||||
# Try exact match first
|
||||
if slug in valid_slugs:
|
||||
return match.group(0) # Keep valid link
|
||||
else:
|
||||
logger.warning(f"NordaGPT hallucination blocked: removed link to non-existent company slug '{slug}' (text: '{link_text}')")
|
||||
return '' # Remove entire link
|
||||
return match.group(0)
|
||||
# Try fuzzy slug match (AI often adds legal suffix)
|
||||
correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
|
||||
if correct_slug:
|
||||
logger.info(f"NordaGPT slug corrected: '{slug}' → '{correct_slug}'")
|
||||
return f'[{link_text}](/firma/{correct_slug})'
|
||||
# Try matching by link text (company name)
|
||||
name_slug = name_to_slug.get(link_text.lower().strip())
|
||||
if name_slug:
|
||||
logger.info(f"NordaGPT slug resolved by name: '{link_text}' → '{name_slug}'")
|
||||
return f'[{link_text}](/firma/{name_slug})'
|
||||
# No match — hallucination, keep just the text without link
|
||||
logger.warning(f"NordaGPT hallucination blocked: '{link_text}' (slug: '{slug}') not in DB")
|
||||
return link_text # Keep text, remove link
|
||||
|
||||
text = re.sub(r'\[([^\]]+)\]\(/firma/([a-z0-9-]+)\)', replace_link, text)
|
||||
|
||||
# 2. Validate pill-style links that the frontend renders
|
||||
# 2. Validate pill-style links
|
||||
def replace_pill_link(match):
|
||||
full_match = match.group(0)
|
||||
slug = match.group(1)
|
||||
link_text = match.group(2) if match.lastindex >= 2 else slug
|
||||
if slug in valid_slugs:
|
||||
return full_match
|
||||
else:
|
||||
logger.warning(f"NordaGPT hallucination blocked: removed pill link to '{slug}'")
|
||||
return ''
|
||||
correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
|
||||
if correct_slug:
|
||||
return full_match.replace(f'/firma/{slug}', f'/firma/{correct_slug}')
|
||||
logger.warning(f"NordaGPT hallucination blocked: pill link '{slug}' not in DB")
|
||||
return link_text
|
||||
|
||||
text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>.*?</a>', replace_pill_link, text)
|
||||
text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>(.*?)</a>', replace_pill_link, text)
|
||||
|
||||
# 3. Clean up empty list items and double spaces left by removals
|
||||
# 3. Clean up artifacts left by removals
|
||||
text = re.sub(r'\*\s*–\s*\n', '\n', text) # "* – " (bullet with removed company)
|
||||
text = re.sub(r'\*\s*oraz\s*–', '*', text) # "* oraz –" fragments
|
||||
text = re.sub(r'\n\s*\*\s*\n', '\n', text) # empty bullet points
|
||||
text = re.sub(r'\n\s*-\s*\n', '\n', text) # empty list items
|
||||
text = re.sub(r' +', ' ', text) # double spaces
|
||||
|
||||
Loading…
Reference in New Issue
Block a user