nordabiz/blueprints/admin/routes_data_quality.py
Maciej Pienczyn e0bb6b718a
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: enhance data quality dashboard with filters, hints, weighted scores and contact scraping
- Add clickable field coverage bars to filter companies missing specific data
- Add quick-action buttons (Registry/SEO/GBP) per company in dashboard table
- Add stale data detection (>6 months) with yellow badges
- Implement weighted priority score (contacts 34%, audits 17%)
- Add data hints in admin company detail showing where to find missing data
- Add "Available data" section showing Google Business data ready to apply
- Add POST /api/company/<id>/apply-hint endpoint for one-click data fill
- Extend website content updater with phone/email extraction (AI + regex)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 07:25:39 +01:00

223 lines
7.3 KiB
Python

"""
Admin Data Quality Dashboard
=============================
Aggregate view of company data quality and completeness across all companies.
"""
import os
import logging
from datetime import datetime
from flask import render_template
from flask_login import login_required
from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole
)
from utils.decorators import role_required
from utils.data_quality import compute_weighted_score
logger = logging.getLogger(__name__)

# Logo files live in <root>/static/img/companies, where <root> is reached by
# taking dirname() of this module's path three times.
_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
LOGO_DIR = os.path.join(_PROJECT_ROOT, 'static', 'img', 'companies')
def _check_logo_exists(slug):
"""Check if company logo file exists on disk."""
if not slug:
return False
for ext in ('webp', 'svg'):
if os.path.isfile(os.path.join(LOGO_DIR, f'{slug}.{ext}')):
return True
return False
def _quality_label(score):
    """Map a weighted score to the dashboard quality label."""
    if score < 34:
        return 'basic'
    if score < 67:
        return 'enhanced'
    return 'complete'


def _score_bucket(score):
    """Map a weighted score to its histogram bucket key."""
    if score <= 25:
        return '0-25'
    if score <= 50:
        return '26-50'
    if score <= 75:
        return '51-75'
    return '76-100'


def _google_hints(analysis, company):
    """List fields the analysis row can fill in on the company profile.

    A hint is produced for each of phone, website and address when the
    analysis row carries a Google value and the matching company field is
    empty. Hints are returned in that fixed order.
    """
    candidates = (
        ('Telefon', analysis.google_phone, company.phone),
        ('Strona WWW', analysis.google_website, company.website),
        ('Adres', analysis.google_address, company.address_city),
    )
    return [
        {
            'company_id': company.id,
            'company_name': company.name,
            'company_slug': company.slug,
            'field': field,
            'source': 'Google Business',
            'value': google_value,
        }
        for field, google_value, current_value in candidates
        if google_value and not current_value
    ]


@bp.route('/data-quality')
@login_required
@role_required(SystemRole.ADMIN)
def admin_data_quality():
    """Render the data quality dashboard with aggregate stats.

    Loads every company whose status is 'active' or 'pending', checks twelve
    completeness fields per company, and passes the template:

    * ``total`` - number of companies considered,
    * ``field_stats`` - per-field ``{'count', 'pct'}`` coverage,
    * ``quality_dist`` / ``score_dist`` - label and score histograms,
    * ``avg_score`` - rounded mean of the weighted scores,
    * ``companies_table`` - one row per company, lowest score first,
    * ``available_data`` - Google-sourced values for empty profile fields,
    * ``now`` - timestamp used for the stale-data check.

    The same set of keys is passed when there are no companies, so the
    template sees one consistent context.
    """
    db = SessionLocal()
    try:
        # NOTE(review): naive local time; assumes the *_fetched_at columns
        # are naive as well - confirm against the model definition.
        now = datetime.now()

        # Full Company entities are loaded, ordered by name.
        companies = db.query(Company).filter(
            Company.status.in_(['active', 'pending'])
        ).order_by(Company.name).all()
        total = len(companies)

        if not total:
            return render_template(
                'admin/data_quality_dashboard.html',
                total=0, field_stats={}, quality_dist={},
                score_dist={}, avg_score=0, companies_table=[],
                available_data=[],
                now=now,
            )

        # One load of the analysis rows serves both the "has SEO audit"
        # lookup and the available-data hints further down.
        analyses = db.query(CompanyWebsiteAnalysis).all()
        seo_company_ids = {a.company_id for a in analyses}

        # Batch query: number of social media profiles per company.
        social_counts = dict(
            db.query(
                CompanySocialMedia.company_id,
                func.count(CompanySocialMedia.id)
            ).group_by(CompanySocialMedia.company_id).all()
        )

        # Batch query: companies with at least one GBP audit.
        gbp_company_ids = {
            row[0] for row in db.query(GBPAudit.company_id).distinct().all()
        }

        # Per-field coverage counters; key order is the display order.
        field_counters = dict.fromkeys((
            'NIP',
            'Adres',
            'Telefon',
            'Email',
            'Strona WWW',
            'Opis',
            'Kategoria',
            'Logo',
            'Dane urzędowe',
            'Audyt SEO',
            'Audyt Social',
            'Audyt GBP',
        ), 0)

        quality_dist = {'basic': 0, 'enhanced': 0, 'complete': 0}
        score_dist = {'0-25': 0, '26-50': 0, '51-75': 0, '76-100': 0}
        score_sum = 0
        companies_table = []

        for c in companies:
            # Twelve-field completeness check for this company.
            fields = {
                'NIP': bool(c.nip),
                'Adres': bool(c.address_city),
                'Telefon': bool(c.phone),
                'Email': bool(c.email),
                'Strona WWW': bool(c.website),
                'Opis': bool(c.description_short),
                'Kategoria': bool(c.category_id),
                'Logo': _check_logo_exists(c.slug),
                'Dane urzędowe': bool(c.ceidg_fetched_at or c.krs_fetched_at),
                'Audyt SEO': c.id in seo_company_ids,
                'Audyt Social': social_counts.get(c.id, 0) > 0,
                'Audyt GBP': c.id in gbp_company_ids,
            }
            filled = sum(fields.values())
            score = compute_weighted_score(fields)

            for field_name, has_value in fields.items():
                if has_value:
                    field_counters[field_name] += 1

            label = _quality_label(score)
            quality_dist[label] += 1
            score_dist[_score_bucket(score)] += 1
            score_sum += score

            # Stale registry data: fetched more than 180 days ago. The KRS
            # timestamp takes precedence over CEIDG when both are set.
            registry_date = c.krs_fetched_at or c.ceidg_fetched_at
            registry_stale = bool(registry_date) and (now - registry_date).days > 180

            companies_table.append({
                'id': c.id,
                'name': c.name,
                'slug': c.slug,
                'score': score,
                'filled': filled,
                'total': len(fields),
                'label': label,
                'data_quality': c.data_quality or 'basic',
                'fields': fields,
                'status': c.status,
                'nip': c.nip or '',
                'website': c.website or '',
                'registry_stale': registry_stale,
                'registry_date': registry_date,
            })

        # Most incomplete companies first; the sort is stable, so ties keep
        # the name ordering from the query.
        companies_table.sort(key=lambda row: row['score'])

        field_stats = {
            name: {'count': count, 'pct': round(count / total * 100)}
            for name, count in field_counters.items()
        }
        # total is non-zero here thanks to the early return above.
        avg_score = round(score_sum / total)

        # Available data: the analysis row has a Google value while the
        # matching company profile field is empty.
        company_map = {c.id: c for c in companies}
        available_data = []
        for analysis in analyses:
            comp = company_map.get(analysis.company_id)
            if comp:
                available_data.extend(_google_hints(analysis, comp))

        return render_template(
            'admin/data_quality_dashboard.html',
            total=total,
            field_stats=field_stats,
            quality_dist=quality_dist,
            score_dist=score_dist,
            avg_score=avg_score,
            companies_table=companies_table,
            available_data=available_data,
            now=now,
        )
    finally:
        db.close()