feat: geographic proximity scoring for website discovery
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Prioritize results from the Wejherowo region (Norda Biznes home area):
- Wejherowo: +3 pts
- Powiat wejherowski (Reda, Rumia, Luzino...): +2 pts
- Województwo pomorskie: +1 pt
- Outside region: 0 pts

The dashboard shows a colored geo badge per candidate.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0ace87e346
commit
d8a0485986
@ -255,6 +255,9 @@ def admin_data_quality():
|
||||
'match_domain': WebsiteDiscoveryService()._domain_matches_company(
|
||||
dc.candidate_domain or '', comp.name
|
||||
),
|
||||
'match_geo': WebsiteDiscoveryService()._compute_geo_proximity(
|
||||
dc.page_text_snippet or ''
|
||||
),
|
||||
})
|
||||
|
||||
# Count companies without website
|
||||
|
||||
@ -45,6 +45,19 @@ DIRECTORY_DOMAINS = {
|
||||
'wikipedia.org', 'olx.pl', 'allegro.pl',
|
||||
}
|
||||
|
||||
# Geographic proximity scoring for Norda Biznes (Wejherowo region).
#
# Lower-case locality names counted as "powiat wejherowski". The geo scorer
# lower-cases the page text before looking these names up, so every entry
# here must stay lower-case (Polish diacritics included).
POWIAT_WEJHEROWSKI = {
    'wejherowo',
    'reda',
    'rumia',
    'luzino',
    'gniewino',
    'szemud',
    'łęczyce',
    'linia',
    'choczewo',
    'góra',
    'bolszewo',
    'gościcino',
    'nowy dwór wejherowski',
    'kąpino',
    'bieszkowice',
    'sopieszyno',
}
|
||||
# Lower-case names of towns in województwo pomorskie that lie outside the
# Wejherowo county list. The geo scorer lower-cases the page text before
# looking these names up, so every entry here must stay lower-case.
WOJEWODZTWO_POMORSKIE = {
    'gdańsk',
    'gdynia',
    'sopot',
    'słupsk',
    'tczew',
    'starogard gdański',
    'chojnice',
    'malbork',
    'kwidzyn',
    'lębork',
    'bytów',
    'kartuzy',
    'kościerzyna',
    'puck',
    'żukowo',
    'pruszcz gdański',
    'ustka',
    'władysławowo',
    'hel',
    'jastarnia',
    'łeba',
}
|
||||
|
||||
|
||||
# --- Extraction helpers ---
|
||||
|
||||
@ -503,8 +516,35 @@ class WebsiteDiscoveryService:
|
||||
else:
|
||||
signals['owner'] = False
|
||||
|
||||
# Geographic proximity (weight varies: 3/2/1)
|
||||
signals['geo'] = self._compute_geo_proximity(text, url=None)
|
||||
|
||||
return signals
|
||||
|
||||
def _compute_geo_proximity(self, page_text, url=None):
|
||||
"""Score geographic proximity to Wejherowo region.
|
||||
Returns: 'wejherowo' (3pt), 'powiat' (2pt), 'pomorskie' (1pt), or False.
|
||||
"""
|
||||
text = (page_text or '').lower()
|
||||
if not text:
|
||||
return False
|
||||
|
||||
# Check Wejherowo first (highest priority)
|
||||
if 'wejherowo' in text:
|
||||
return 'wejherowo'
|
||||
|
||||
# Check powiat wejherowski cities
|
||||
for city in POWIAT_WEJHEROWSKI:
|
||||
if city in text:
|
||||
return 'powiat'
|
||||
|
||||
# Check województwo pomorskie
|
||||
for city in WOJEWODZTWO_POMORSKIE:
|
||||
if city in text:
|
||||
return 'pomorskie'
|
||||
|
||||
return False
|
||||
|
||||
def _domain_matches_company(self, domain, company_name):
|
||||
"""Check if domain name matches company name (handles word reordering)."""
|
||||
if not domain or not company_name:
|
||||
@ -549,6 +589,15 @@ class WebsiteDiscoveryService:
|
||||
}
|
||||
score = sum(weights.get(k, 0) for k, v in signals.items() if v)
|
||||
|
||||
# Geographic proximity bonus
|
||||
geo = signals.get('geo')
|
||||
if geo == 'wejherowo':
|
||||
score += 3
|
||||
elif geo == 'powiat':
|
||||
score += 2
|
||||
elif geo == 'pomorskie':
|
||||
score += 1
|
||||
|
||||
if score >= 5:
|
||||
return 'high', score
|
||||
elif score >= 2:
|
||||
|
||||
@ -714,6 +714,15 @@
|
||||
<span class="disc-badge {% if d.match_owner %}disc-match{% else %}disc-miss{% endif %}">Właściciel</span>
|
||||
{% endif %}
|
||||
<span class="disc-badge {% if d.match_domain %}disc-match{% else %}disc-miss{% endif %}">Domena</span>
|
||||
{% if d.match_geo == 'wejherowo' %}
|
||||
<span class="disc-badge disc-match">Wejherowo</span>
|
||||
{% elif d.match_geo == 'powiat' %}
|
||||
<span class="disc-badge disc-match" style="background: #fef3c7; color: #92400e;">Powiat</span>
|
||||
{% elif d.match_geo == 'pomorskie' %}
|
||||
<span class="disc-badge disc-match" style="background: #e0e7ff; color: #3730a3;">Pomorskie</span>
|
||||
{% else %}
|
||||
<span class="disc-badge disc-miss">Lokalizacja</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
</td>
|
||||
<td>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user