fix(nordagpt): smarter company validator — fix slugs instead of removing real companies

- AI generates "inpi-sp-z-o-o" but real slug is "inpi" → now auto-corrected
- Fuzzy prefix matching on slugs (handles legal form suffixes)
- Name-based resolution as fallback (match link text to company name)
- Hallucinated companies: keep text, remove link (instead of deleting entirely)
- Better cleanup of artifacts ("oraz –", empty bullets)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-28 06:30:20 +01:00 · 2026-03-28 06:30:20 +01:00 · 464e456939
commit 464e456939
parent c167794bb6
1 changed file with 53 additions and 17 deletions
--- a/nordabiz_chat.py
+++ b/nordabiz_chat.py
@@ -145,43 +145,79 @@ class NordaBizChatEngine:
        finally:
            db.close()

+    @staticmethod
+    def _find_correct_slug(attempted_slug: str, valid_slugs: set, name_to_slug: dict) -> Optional[str]:
+        """Try to find the correct slug for a hallucinated one."""
+        # Direct match
+        if attempted_slug in valid_slugs:
+            return attempted_slug
+        # AI often appends legal form to slug: "inpi-sp-z-o-o" instead of "inpi"
+        # Try prefix match: if any valid slug is a prefix of the attempted slug
+        for vs in valid_slugs:
+            if attempted_slug.startswith(vs + '-') or attempted_slug == vs:
+                return vs
+        # Try if attempted slug is a prefix of a valid slug
+        for vs in valid_slugs:
+            if vs.startswith(attempted_slug + '-') or vs.startswith(attempted_slug):
+                return vs
+        return None
+
    @staticmethod
    def _validate_company_references(text: str) -> str:
        """
-        Post-process AI response: remove links to companies that don't exist in DB.
+        Post-process AI response: fix or remove links to companies that don't exist in DB.
        This is the ONLY reliable way to prevent hallucinated company names.
        """
        import re

        valid_companies = NordaBizChatEngine._get_valid_company_slugs()
        valid_slugs = set(valid_companies.keys())
-        valid_names_lower = {name.lower(): name for name in valid_companies.values()}
+        # Map: lowercase name → slug
+        name_to_slug = {}
+        for slug, name in valid_companies.items():
+            name_to_slug[name.lower()] = slug

-        # 1. Validate markdown links to /firma/slug — remove if slug doesn't exist
+        # 1. Validate markdown links to /firma/slug — fix or remove
        def replace_link(match):
            link_text = match.group(1)
            slug = match.group(2)
+            # Try exact match first
            if slug in valid_slugs:
-                return match.group(0)  # Keep valid link
-            else:
-                logger.warning(f"NordaGPT hallucination blocked: removed link to non-existent company slug '{slug}' (text: '{link_text}')")
-                return ''  # Remove entire link
+                return match.group(0)
+            # Try fuzzy slug match (AI often adds legal suffix)
+            correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
+            if correct_slug:
+                logger.info(f"NordaGPT slug corrected: '{slug}' → '{correct_slug}'")
+                return f'[{link_text}](/firma/{correct_slug})'
+            # Try matching by link text (company name)
+            name_slug = name_to_slug.get(link_text.lower().strip())
+            if name_slug:
+                logger.info(f"NordaGPT slug resolved by name: '{link_text}' → '{name_slug}'")
+                return f'[{link_text}](/firma/{name_slug})'
+            # No match — hallucination, keep just the text without link
+            logger.warning(f"NordaGPT hallucination blocked: '{link_text}' (slug: '{slug}') not in DB")
+            return link_text  # Keep text, remove link

        text = re.sub(r'\[([^\]]+)\]\(/firma/([a-z0-9-]+)\)', replace_link, text)

-        # 2. Validate pill-style links that the frontend renders
+        # 2. Validate pill-style links
        def replace_pill_link(match):
            full_match = match.group(0)
            slug = match.group(1)
+            link_text = match.group(2) if match.lastindex >= 2 else slug
            if slug in valid_slugs:
                return full_match
-            else:
-                logger.warning(f"NordaGPT hallucination blocked: removed pill link to '{slug}'")
-                return ''
+            correct_slug = NordaBizChatEngine._find_correct_slug(slug, valid_slugs, name_to_slug)
+            if correct_slug:
+                return full_match.replace(f'/firma/{slug}', f'/firma/{correct_slug}')
+            logger.warning(f"NordaGPT hallucination blocked: pill link '{slug}' not in DB")
+            return link_text

-        text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>.*?</a>', replace_pill_link, text)
+        text = re.sub(r'<a[^>]*href=["\']/firma/([a-z0-9-]+)["\'][^>]*>(.*?)</a>', replace_pill_link, text)

-        # 3. Clean up empty list items and double spaces left by removals
+        # 3. Clean up artifacts left by removals
+        text = re.sub(r'\*\s*–\s*\n', '\n', text)     # "* – " (bullet with removed company)
+        text = re.sub(r'\*\s*oraz\s*–', '*', text)     # "* oraz –" fragments
        text = re.sub(r'\n\s*\*\s*\n', '\n', text)     # empty bullet points
        text = re.sub(r'\n\s*-\s*\n', '\n', text)      # empty list items
        text = re.sub(r'  +', ' ', text)                # double spaces