feat: geographic proximity scoring for website discovery
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Prioritize results from Wejherowo region (Norda Biznes home area):
- Wejherowo: +3 pts
- Powiat wejherowski (Reda, Rumia, Luzino...): +2 pts
- Województwo pomorskie: +1 pt
- Outside region: 0 pts

Dashboard shows colored geo badge per candidate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-21 10:15:40 +01:00
parent 0ace87e346
commit d8a0485986
3 changed files with 61 additions and 0 deletions

View File

@ -255,6 +255,9 @@ def admin_data_quality():
'match_domain': WebsiteDiscoveryService()._domain_matches_company(
dc.candidate_domain or '', comp.name
),
'match_geo': WebsiteDiscoveryService()._compute_geo_proximity(
dc.page_text_snippet or ''
),
})
# Count companies without website

View File

@ -45,6 +45,19 @@ DIRECTORY_DOMAINS = {
'wikipedia.org', 'olx.pl', 'allegro.pl',
}
# Place names used to rank website candidates by closeness to the Norda Biznes
# home area (Wejherowo). All entries are lowercase because they are compared
# against lower-cased page text.

# Localities in powiat wejherowski (the county around Wejherowo).
POWIAT_WEJHEROWSKI = {
    'bieszkowice',
    'bolszewo',
    'choczewo',
    'gniewino',
    'gościcino',
    'góra',
    'kąpino',
    'linia',
    'luzino',
    'nowy dwór wejherowski',
    'reda',
    'rumia',
    'sopieszyno',
    'szemud',
    'wejherowo',
    'łęczyce',
}

# Other towns in województwo pomorskie (the wider voivodeship).
WOJEWODZTWO_POMORSKIE = {
    'bytów',
    'chojnice',
    'gdańsk',
    'gdynia',
    'hel',
    'jastarnia',
    'kartuzy',
    'kościerzyna',
    'kwidzyn',
    'lębork',
    'malbork',
    'pruszcz gdański',
    'puck',
    'sopot',
    'starogard gdański',
    'słupsk',
    'tczew',
    'ustka',
    'władysławowo',
    'łeba',
    'żukowo',
}
# --- Extraction helpers ---
@ -503,8 +516,35 @@ class WebsiteDiscoveryService:
else:
signals['owner'] = False
# Geographic proximity (weight varies: 3/2/1)
signals['geo'] = self._compute_geo_proximity(text, url=None)
return signals
def _compute_geo_proximity(self, page_text, url=None):
    """Score geographic proximity of a page to the Wejherowo region.

    The page text is lower-cased and searched for known place names, from
    the most specific tier to the least specific one; the first tier that
    matches wins.

    Args:
        page_text: Text extracted from the candidate page (may be None or
            empty).
        url: Accepted for interface compatibility; currently not used.

    Returns:
        'wejherowo' (3 pt), 'powiat' (2 pt), 'pomorskie' (1 pt), or False
        when no known place name is found.
    """
    # Function-scope import keeps this method self-contained.
    import re

    text = (page_text or '').lower()
    if not text:
        return False

    def mentions(places):
        """Return True if any name in *places* occurs as a whole word."""
        if not places:
            return False
        alternatives = '|'.join(re.escape(name) for name in sorted(places))
        # Plain substring tests produced false positives: 'hel' matched
        # "hello"/"shell", 'reda' matched "redakcja", 'puck' matched "pucka".
        # The lookarounds require a non-word character (or the string edge)
        # on both sides; \w is Unicode-aware for str patterns, so Polish
        # letters such as 'ł' and 'ę' count as word characters.
        pattern = r'(?<!\w)(?:' + alternatives + r')(?!\w)'
        return re.search(pattern, text) is not None

    # NOTE(review): whole-word matching cannot disambiguate names that are
    # also ordinary words or parts of other place names (e.g. 'linia',
    # 'góra' in "Zielona Góra"); that would need changes to the name lists.

    # Wejherowo itself has the highest priority.
    if mentions(('wejherowo',)):
        return 'wejherowo'
    # Then localities in powiat wejherowski.
    if mentions(POWIAT_WEJHEROWSKI):
        return 'powiat'
    # Finally other towns in województwo pomorskie.
    if mentions(WOJEWODZTWO_POMORSKIE):
        return 'pomorskie'
    return False
def _domain_matches_company(self, domain, company_name):
"""Check if domain name matches company name (handles word reordering)."""
if not domain or not company_name:
@ -549,6 +589,15 @@ class WebsiteDiscoveryService:
}
score = sum(weights.get(k, 0) for k, v in signals.items() if v)
# Geographic proximity bonus
geo = signals.get('geo')
if geo == 'wejherowo':
score += 3
elif geo == 'powiat':
score += 2
elif geo == 'pomorskie':
score += 1
if score >= 5:
return 'high', score
elif score >= 2:

View File

@ -714,6 +714,15 @@
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
{% endif %}
<span class="disc-badge {% if d.match_domain %}disc-match{% else %}disc-miss{% endif %}">Domena</span>
{# Geo-proximity badge. d.match_geo is 'wejherowo', 'powiat' or 'pomorskie';
   any other (falsy) value renders the generic "miss" badge. The powiat and
   pomorskie badges override the default match colours with inline styles. #}
{% if d.match_geo == 'wejherowo' %}
<span class="disc-badge disc-match">Wejherowo</span>
{% elif d.match_geo == 'powiat' %}
<span class="disc-badge disc-match" style="background: #fef3c7; color: #92400e;">Powiat</span>
{% elif d.match_geo == 'pomorskie' %}
<span class="disc-badge disc-match" style="background: #e0e7ff; color: #3730a3;">Pomorskie</span>
{% else %}
<span class="disc-badge disc-miss">Lokalizacja</span>
{% endif %}
</div>
</td>
<td>