nordabiz/blueprints/admin/routes_data_quality.py
Maciej Pienczyn 93e90b2c72
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
feat: add data quality dashboard, auto-scoring, bulk enrichment and GBP data flow
- Extract 12-field completeness scoring to utils/data_quality.py service
- Auto-update data_quality_score and data_quality label on company data changes
- Add /admin/data-quality dashboard with field coverage stats, quality distribution, and sortable company table
- Add bulk enrichment with background processing, step selection, and progress tracking
- Flow GBP phone/website to Company record when company fields are empty
- Display Google opening hours on public company profile
- Add BulkEnrichmentJob model and migration 075
- Refactor arm_company.py to support selective steps and progress callbacks

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-21 07:02:45 +01:00

185 lines
5.5 KiB
Python

"""
Admin Data Quality Dashboard
=============================
Aggregate view of company data quality and completeness across all companies.
"""
import os
import logging
from datetime import datetime
from flask import render_template
from flask_login import login_required
from sqlalchemy import func
from . import bp
from database import (
SessionLocal, Company, CompanyWebsiteAnalysis,
CompanySocialMedia, GBPAudit, SystemRole
)
from utils.decorators import role_required
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Directory that is scanned for company logo files: three directory levels
# above this file, followed by static/img/companies.
LOGO_DIR = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'static', 'img', 'companies')
def _check_logo_exists(slug):
    """Tell whether a logo file for the given company slug is present on disk.

    A logo counts as present when ``<slug>.webp`` or ``<slug>.svg`` is a
    regular file inside ``LOGO_DIR``. A falsy slug (``None`` or an empty
    string) always yields ``False`` without touching the filesystem.
    """
    if not slug:
        return False

    # Candidate paths are generated lazily, so the check stops at the first
    # extension that matches (webp is tried before svg).
    candidate_paths = (
        os.path.join(LOGO_DIR, f'{slug}.{extension}')
        for extension in ('webp', 'svg')
    )
    return any(os.path.isfile(path) for path in candidate_paths)
def _company_field_flags(company, seo_company_ids, social_counts, gbp_company_ids):
    """Return the 12 completeness checks for one company as ``{label: bool}``.

    Args:
        company: Company row being evaluated.
        seo_company_ids: set of company ids that have a CompanyWebsiteAnalysis row.
        social_counts: mapping of company id -> number of CompanySocialMedia rows.
        gbp_company_ids: set of company ids that have a GBPAudit row.

    The keys are the labels passed to the template; their order defines the
    order in which fields are reported.
    """
    return {
        'NIP': bool(company.nip),
        # The address is considered filled as soon as the city is known.
        'Adres': bool(company.address_city),
        'Telefon': bool(company.phone),
        'Email': bool(company.email),
        'Strona WWW': bool(company.website),
        'Opis': bool(company.description_short),
        'Kategoria': bool(company.category_id),
        'Logo': _check_logo_exists(company.slug),
        # Either registry fetch timestamp (CEIDG or KRS) is enough.
        'Dane urzędowe': bool(company.ceidg_fetched_at or company.krs_fetched_at),
        'Audyt SEO': company.id in seo_company_ids,
        'Audyt Social': social_counts.get(company.id, 0) > 0,
        'Audyt GBP': company.id in gbp_company_ids,
    }


def _quality_label(score):
    """Map a 0-100 score to 'basic' (< 34), 'enhanced' (< 67) or 'complete'."""
    if score < 34:
        return 'basic'
    if score < 67:
        return 'enhanced'
    return 'complete'


def _score_bucket(score):
    """Map a 0-100 score to its histogram bucket key ('0-25' ... '76-100')."""
    if score <= 25:
        return '0-25'
    if score <= 50:
        return '26-50'
    if score <= 75:
        return '51-75'
    return '76-100'


@bp.route('/data-quality')
@login_required
@role_required(SystemRole.ADMIN)
def admin_data_quality():
    """Render the data quality dashboard with aggregate completeness stats.

    Guarded by ``login_required`` and ``role_required(SystemRole.ADMIN)``.

    For every company whose status is 'active' or 'pending' the view evaluates
    12 completeness checks, derives a 0-100 score, and aggregates:

    * ``field_stats``  - per field: number of companies that have it and the
      rounded percentage of all considered companies,
    * ``quality_dist`` - number of companies per label (basic/enhanced/complete),
    * ``score_dist``   - number of companies per score bucket,
    * ``avg_score``    - rounded mean score,
    * ``companies_table`` - one dict per company, sorted by score ascending.

    When no company matches, the template is rendered with empty aggregates.

    Returns:
        The rendered 'admin/data_quality_dashboard.html' template.
    """
    db = SessionLocal()
    try:
        now = datetime.now()

        # Load all active/pending companies, ordered by name. Full Company
        # entities are loaded here, not a reduced column set.
        companies = db.query(Company).filter(
            Company.status.in_(['active', 'pending'])
        ).order_by(Company.name).all()

        total = len(companies)
        if total == 0:
            return render_template(
                'admin/data_quality_dashboard.html',
                total=0, field_stats={}, quality_dist={},
                score_dist={}, avg_score=0, companies_table=[],
                now=now,
            )

        # Batch lookups, executed once each, so the per-company loop below
        # does not issue any additional queries for the audit checks.
        seo_company_ids = set(
            row[0] for row in db.query(CompanyWebsiteAnalysis.company_id).all()
        )
        social_counts = dict(
            db.query(
                CompanySocialMedia.company_id,
                func.count(CompanySocialMedia.id)
            ).group_by(CompanySocialMedia.company_id).all()
        )
        gbp_company_ids = set(
            row[0] for row in db.query(GBPAudit.company_id).distinct().all()
        )

        # Per-field coverage counters. Every field is pre-seeded with 0 so a
        # field that no company has still shows up in field_stats.
        field_counters = {
            'NIP': 0,
            'Adres': 0,
            'Telefon': 0,
            'Email': 0,
            'Strona WWW': 0,
            'Opis': 0,
            'Kategoria': 0,
            'Logo': 0,
            'Dane urzędowe': 0,
            'Audyt SEO': 0,
            'Audyt Social': 0,
            'Audyt GBP': 0,
        }

        quality_dist = {'basic': 0, 'enhanced': 0, 'complete': 0}
        score_dist = {'0-25': 0, '26-50': 0, '51-75': 0, '76-100': 0}
        score_sum = 0
        companies_table = []

        for c in companies:
            fields = _company_field_flags(
                c, seo_company_ids, social_counts, gbp_company_ids
            )
            filled = sum(fields.values())
            # Integer arithmetic: floor of the percentage, with no float
            # rounding error (int(filled / n * 100) can under-report by one
            # for some n, e.g. 29/100).
            score = filled * 100 // len(fields)

            for field_name, has_value in fields.items():
                if has_value:
                    field_counters[field_name] += 1

            label = _quality_label(score)
            quality_dist[label] += 1
            score_dist[_score_bucket(score)] += 1
            score_sum += score

            companies_table.append({
                'id': c.id,
                'name': c.name,
                'slug': c.slug,
                'score': score,
                'filled': filled,
                'total': len(fields),
                # 'label' is computed here; 'data_quality' is the value stored
                # on the company row (falling back to 'basic' when empty).
                'label': label,
                'data_quality': c.data_quality or 'basic',
                'fields': fields,
                'status': c.status,
            })

        # Most incomplete first. The sort is stable, so companies with equal
        # scores keep the name ordering of the query.
        companies_table.sort(key=lambda row: row['score'])

        field_stats = {
            name: {'count': count, 'pct': round(count / total * 100)}
            for name, count in field_counters.items()
        }

        # total is guaranteed to be > 0 here because of the early return above.
        avg_score = round(score_sum / total)

        return render_template(
            'admin/data_quality_dashboard.html',
            total=total,
            field_stats=field_stats,
            quality_dist=quality_dist,
            score_dist=score_dist,
            avg_score=avg_score,
            companies_table=companies_table,
            now=now,
        )
    finally:
        db.close()