feat: scrape subpages (kontakt, o-nas) for NIP/REGON verification
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

The root page often lacks NIP/REGON. The discovery service now also scrapes
/kontakt/, /contact, /o-nas, and /o-firmie to find strong verification
signals, and stops scanning subpages early once a NIP, REGON, or KRS is found.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 09:23:50 +01:00
parent 880f5a6715
commit 11184c5a58

View File

@@ -282,23 +282,43 @@ class WebsiteDiscoveryService:
if existing:
continue
# Fetch and extract
page_text = _fetch_page_text(url)
# Fetch root + common subpages for verification data
all_text = ''
extracted = {
'nips': [], 'regons': [], 'krs': [],
'emails': [], 'phones': [], 'text_snippet': '',
}
if page_text:
extracted = {
'nips': _find_nips_in_text(page_text),
'regons': _find_regons_in_text(page_text),
'krs': _find_krs_in_text(page_text),
'emails': _extract_emails(page_text),
'phones': _extract_phones(page_text),
'text_snippet': page_text[:500],
}
else:
extracted = {
'nips': [], 'regons': [], 'krs': [],
'emails': [], 'phones': [], 'text_snippet': '',
}
subpages = ['', 'kontakt', 'kontakt/', 'contact', 'o-nas', 'o-firmie']
for sub in subpages:
sub_url = url.rstrip('/') + '/' + sub if sub else url
text = _fetch_page_text(sub_url)
if not text:
continue
all_text = all_text + ' ' + text if all_text else text
if not extracted['text_snippet']:
extracted['text_snippet'] = text[:500]
# Merge extracted data (deduplicated)
for nip in _find_nips_in_text(text):
if nip not in extracted['nips']:
extracted['nips'].append(nip)
for regon in _find_regons_in_text(text):
if regon not in extracted['regons']:
extracted['regons'].append(regon)
for krs in _find_krs_in_text(text):
if krs not in extracted['krs']:
extracted['krs'].append(krs)
for email in _extract_emails(text):
if email not in extracted['emails']:
extracted['emails'].append(email)
for phone in _extract_phones(text):
if phone not in extracted['phones']:
extracted['phones'].append(phone)
# Stop scanning subpages if we already found strong signals
if extracted['nips'] or extracted['regons'] or extracted['krs']:
break
page_text = all_text or None
# Compute match signals
signals = self._compute_signals(extracted, company, page_text)