Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Evaluate top 3 Brave results instead of just taking the first one. Add domain name matching signal (+2 pts when domain contains company name). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
278 lines
9.7 KiB
Python
278 lines
9.7 KiB
Python
"""
Admin Data Quality Dashboard
=============================

Aggregate view of company data quality and completeness across all companies.
"""
|
|
|
|
import os
|
|
import logging
|
|
from datetime import datetime
|
|
|
|
from flask import render_template
|
|
from flask_login import login_required
|
|
from sqlalchemy import func
|
|
|
|
from . import bp
|
|
from database import (
|
|
SessionLocal, Company, CompanyWebsiteAnalysis,
|
|
CompanySocialMedia, GBPAudit, SystemRole,
|
|
WebsiteDiscoveryCandidate
|
|
)
|
|
from utils.decorators import role_required
|
|
from utils.data_quality import compute_weighted_score
|
|
from services.website_discovery_service import WebsiteDiscoveryService
|
|
|
|
logger = logging.getLogger(__name__)

# Company logos are looked up under static/img/companies, rooted at the
# directory obtained by applying dirname() three times to this module's path.
LOGO_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    'static',
    'img',
    'companies',
)
|
|
|
|
|
|
def _check_logo_exists(slug):
    """Report whether a logo file for ``slug`` is present in ``LOGO_DIR``.

    Looks for ``<slug>.webp`` first and ``<slug>.svg`` second; a falsy slug
    never matches.
    """
    if not slug:
        return False
    candidate_paths = (
        os.path.join(LOGO_DIR, f'{slug}.{ext}') for ext in ('webp', 'svg')
    )
    # any() stops at the first existing file, like an early return in a loop.
    return any(os.path.isfile(path) for path in candidate_paths)
|
|
|
|
|
|
def _quality_label(score):
    """Map a completeness score to its quality label.

    Scores below 34 are 'basic', scores below 67 are 'enhanced', anything
    else is 'complete'.
    """
    if score < 34:
        return 'basic'
    if score < 67:
        return 'enhanced'
    return 'complete'


def _score_bucket(score):
    """Return the score-distribution bucket key that ``score`` falls into."""
    if score <= 25:
        return '0-25'
    if score <= 50:
        return '26-50'
    if score <= 75:
        return '51-75'
    return '76-100'


def _collect_available_data(analyses, company_map):
    """List Google-sourced values for fields that are empty on the company.

    Args:
        analyses: iterable of CompanyWebsiteAnalysis rows.
        company_map: dict mapping company id to its Company row; analyses
            whose company is not in the map are skipped.

    Returns:
        A list of dicts, one per (company, field) where the analysis has a
        Google value and the company's own field is empty. Each entry carries
        ``google_name`` so an admin can verify the match is correct.
    """
    suggestions = []
    for analysis in analyses:
        comp = company_map.get(analysis.company_id)
        if not comp:
            continue
        g_name = analysis.google_name or ''
        # (label shown to the admin, value Google has, value on the profile)
        checks = (
            ('Telefon', analysis.google_phone, comp.phone),
            ('Strona WWW', analysis.google_website, comp.website),
            ('Adres', analysis.google_address, comp.address_city),
        )
        for field_label, google_value, current_value in checks:
            if google_value and not current_value:
                suggestions.append({
                    'company_id': comp.id,
                    'company_name': comp.name,
                    'company_slug': comp.slug,
                    'field': field_label,
                    'source': 'Google Business',
                    'value': google_value,
                    'google_name': g_name,
                })
    return suggestions


def _build_discovery_rows(candidates, company_map):
    """Convert website-discovery candidates into rows for the template.

    Args:
        candidates: iterable of WebsiteDiscoveryCandidate rows.
        company_map: dict mapping company id to its Company row; candidates
            whose company is not in the map are skipped.

    Returns:
        A list of dicts with the candidate's URL/match flags, ``has_*`` flags
        telling whether the company has the corresponding datum at all, and
        ``match_domain`` computed by the discovery service.
    """
    rows = []
    service = None
    for dc in candidates:
        comp = company_map.get(dc.company_id)
        if not comp:
            continue
        if service is None:
            # Build the service once and reuse it for every row instead of
            # constructing a new instance per candidate.
            # NOTE(review): assumes _domain_matches_company does not depend on
            # per-instance state that changes between calls - confirm.
            service = WebsiteDiscoveryService()
        rows.append({
            'id': dc.id,
            'company_id': dc.company_id,
            'company_name': comp.name,
            'company_slug': comp.slug,
            'url': dc.candidate_url,
            'domain': dc.candidate_domain or '',
            'title': dc.brave_title or '',
            'brave_description': (dc.brave_description or '')[:120],
            'snippet': (dc.page_text_snippet or '')[:500],
            'match_nip': dc.match_nip,
            'match_regon': dc.match_regon,
            'match_krs': dc.match_krs,
            'match_phone': dc.match_phone,
            'match_email': dc.match_email,
            'match_city': dc.match_city,
            'match_owner': dc.match_owner,
            'confidence': dc.confidence,
            'score': dc.match_score,
            'has_nip': bool(comp.nip),
            'has_regon': bool(comp.regon),
            'has_krs': bool(comp.krs),
            'has_phone': bool(comp.phone),
            'has_email': bool(comp.email),
            'has_city': bool(comp.address_city),
            'has_owner': bool(getattr(comp, 'owner_name', None)),
            'match_domain': service._domain_matches_company(
                dc.candidate_domain or '', comp.name
            ),
        })
    return rows


@bp.route('/data-quality')
@login_required
@role_required(SystemRole.ADMIN)
def admin_data_quality():
    """Render the data quality dashboard with aggregate stats.

    For every company with status 'active' or 'pending' a 12-field
    completeness check is computed and scored with compute_weighted_score().
    The view then aggregates per-field coverage, quality-label and score
    distributions, lists Google-sourced values that could fill empty profile
    fields, and lists pending website-discovery candidates.

    Returns:
        The rendered 'admin/data_quality_dashboard.html' template. The same
        set of context variables is passed whether or not any company exists.
    """
    db = SessionLocal()
    try:
        now = datetime.now()

        # Load all active/pending companies, ordered by name.
        companies = db.query(Company).filter(
            Company.status.in_(['active', 'pending'])
        ).order_by(Company.name).all()

        total = len(companies)
        if not companies:
            # Pass every variable the populated branch passes, so the template
            # sees a consistent context even when there is nothing to show.
            return render_template(
                'admin/data_quality_dashboard.html',
                total=0, field_stats={}, quality_dist={},
                score_dist={}, avg_score=0, companies_table=[],
                available_data=[],
                discovery_data=[],
                companies_without_website=0,
                now=now,
            )

        # Batch query: ids of companies that have a website/SEO analysis row.
        seo_company_ids = {
            row[0] for row in db.query(CompanyWebsiteAnalysis.company_id).all()
        }

        # Batch query: number of social media profiles per company.
        social_counts = dict(
            db.query(
                CompanySocialMedia.company_id,
                func.count(CompanySocialMedia.id)
            ).group_by(CompanySocialMedia.company_id).all()
        )

        # Batch query: ids of companies that have a GBP audit.
        gbp_company_ids = {
            row[0] for row in db.query(GBPAudit.company_id).distinct().all()
        }

        # Per-field coverage counters (key order is the display order).
        field_counters = {
            'NIP': 0,
            'Adres': 0,
            'Telefon': 0,
            'Email': 0,
            'Strona WWW': 0,
            'Opis': 0,
            'Kategoria': 0,
            'Logo': 0,
            'Dane urzędowe': 0,
            'Audyt SEO': 0,
            'Audyt Social': 0,
            'Audyt GBP': 0,
        }

        quality_dist = {'basic': 0, 'enhanced': 0, 'complete': 0}
        score_dist = {'0-25': 0, '26-50': 0, '51-75': 0, '76-100': 0}
        score_sum = 0

        # Per-company table data.
        companies_table = []

        for c in companies:
            # 12-field completeness check for this company.
            fields = {
                'NIP': bool(c.nip),
                'Adres': bool(c.address_city),
                'Telefon': bool(c.phone),
                'Email': bool(c.email),
                'Strona WWW': bool(c.website),
                'Opis': bool(c.description_short),
                'Kategoria': bool(c.category_id),
                'Logo': _check_logo_exists(c.slug),
                'Dane urzędowe': bool(c.ceidg_fetched_at or c.krs_fetched_at),
                'Audyt SEO': c.id in seo_company_ids,
                'Audyt Social': social_counts.get(c.id, 0) > 0,
                'Audyt GBP': c.id in gbp_company_ids,
            }

            filled = sum(fields.values())
            score = compute_weighted_score(fields)

            for field_name, has_value in fields.items():
                if has_value:
                    field_counters[field_name] += 1

            label = _quality_label(score)
            quality_dist[label] += 1
            score_dist[_score_bucket(score)] += 1
            score_sum += score

            # Stale data detection: registry data fetched more than 180 days
            # ago (KRS timestamp takes precedence over CEIDG).
            registry_done = fields['Dane urzędowe']
            registry_date = c.krs_fetched_at or c.ceidg_fetched_at
            registry_stale = registry_done and (
                (not registry_date) or ((now - registry_date).days > 180)
            )

            companies_table.append({
                'id': c.id,
                'name': c.name,
                'slug': c.slug,
                'score': score,
                'filled': filled,
                'total': len(fields),
                'label': label,
                'data_quality': c.data_quality or 'basic',
                'fields': fields,
                'status': c.status,
                'nip': c.nip or '',
                'website': c.website or '',
                'registry_stale': registry_stale,
                'registry_date': registry_date,
            })

        # Sort by score ascending (most incomplete first).
        companies_table.sort(key=lambda row: row['score'])

        # Field coverage as absolute count plus rounded percentage.
        field_stats = {
            name: {'count': count, 'pct': round(count / total * 100)}
            for name, count in field_counters.items()
        }

        # total is non-zero here: the empty case returned early above.
        avg_score = round(score_sum / total)

        company_map = {c.id: c for c in companies}

        # Available data: Google has a value but the company profile is empty.
        analyses = db.query(CompanyWebsiteAnalysis).all()
        available_data = _collect_available_data(analyses, company_map)

        # Website discovery candidates that are still pending, best match first.
        discovery_candidates = db.query(WebsiteDiscoveryCandidate).filter(
            WebsiteDiscoveryCandidate.status == 'pending',
            WebsiteDiscoveryCandidate.candidate_url != 'none',
        ).order_by(WebsiteDiscoveryCandidate.match_score.desc()).all()
        discovery_data = _build_discovery_rows(discovery_candidates, company_map)

        # Count companies without website.
        companies_without_website = sum(
            1 for row in companies_table if not row['website']
        )

        return render_template(
            'admin/data_quality_dashboard.html',
            total=total,
            field_stats=field_stats,
            quality_dist=quality_dist,
            score_dist=score_dist,
            avg_score=avg_score,
            companies_table=companies_table,
            available_data=available_data,
            discovery_data=discovery_data,
            companies_without_website=companies_without_website,
            now=now,
        )
    finally:
        # Always release the session, including on the early return and on errors.
        db.close()
|