From 880f5a67156ae36f0050298d3ffee6529df4840a Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn
Date: Sat, 21 Feb 2026 09:09:50 +0100
Subject: [PATCH] fix: normalize discovery URLs to root domain

Strip paths from candidate URLs (e.g. /kontakt/, /about/) to always
save root domain. Deduplicates results pointing to same domain.

Co-Authored-By: Claude Opus 4.6
---
 services/website_discovery_service.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/services/website_discovery_service.py b/services/website_discovery_service.py
index d397da9..8e2595b 100644
--- a/services/website_discovery_service.py
+++ b/services/website_discovery_service.py
@@ -164,6 +164,19 @@ def _extract_phones(text):
     return list(dict.fromkeys(phones))[:5]
 
 
+def _normalize_url_to_root(url):
+    """Strip path from URL, keep only scheme + domain (root page)."""
+    try:
+        parsed = urlparse(url)
+        scheme = parsed.scheme or 'https'
+        netloc = parsed.netloc
+        if not netloc:
+            return url
+        return f'{scheme}://{netloc}/'
+    except Exception:
+        return url
+
+
 def _is_directory_domain(url):
     """Check if URL belongs to a known business directory."""
     try:
@@ -248,9 +261,16 @@ class WebsiteDiscoveryService:
         # Evaluate top 3 candidates, pick the best
         best_candidate = None
         best_score = -1
+        seen_urls = set()
 
         for brave_result in urls[:3]:
-            url = brave_result['url']
+            url = _normalize_url_to_root(brave_result['url'])
+
+            # Skip duplicate root URLs (e.g. /kontakt/ and /about/ on same domain)
+            if url in seen_urls:
+                continue
+            seen_urls.add(url)
+
             domain = urlparse(url).netloc.lower()
             if domain.startswith('www.'):
                 domain = domain[4:]