fix: normalize discovery URLs to root domain

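Strip the path from each candidate URL (e.g. /kontakt/, /about/) so that only the root URL of the domain is saved. Skip candidates that normalize to a root URL that has already been seen, so that several results pointing to the same domain are evaluated only once.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>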
2026-02-21 09:09:50 +01:00 · 2026-02-21 09:09:50 +01:00 · 880f5a6715
commit 880f5a6715
parent ced2d0337e
1 changed file with 21 additions and 1 deletion
--- a/services/website_discovery_service.py
+++ b/services/website_discovery_service.py
@@ -164,6 +164,19 @@ def _extract_phones(text):
    return list(dict.fromkeys(phones))[:5]


+def _normalize_url_to_root(url):
+    """Strip path from URL, keep only scheme + domain (root page)."""
+    try:
+        parsed = urlparse(url)
+        scheme = parsed.scheme or 'https'
+        netloc = parsed.netloc
+        if not netloc:
+            return url
+        return f'{scheme}://{netloc}/'
+    except Exception:
+        return url
+
+
 def _is_directory_domain(url):
    """Check if URL belongs to a known business directory."""
    try:
@@ -248,9 +261,16 @@ class WebsiteDiscoveryService:
            # Evaluate top 3 candidates, pick the best
            best_candidate = None
            best_score = -1
+            seen_urls = set()

            for brave_result in urls[:3]:
-                url = brave_result['url']
+                url = _normalize_url_to_root(brave_result['url'])
+
+                # Skip duplicate root URLs (e.g. /kontakt/ and /about/ on same domain)
+                if url in seen_urls:
+                    continue
+                seen_urls.add(url)
+
                domain = urlparse(url).netloc.lower()
                if domain.startswith('www.'):
                    domain = domain[4:]