From 601bd995596f810f9b1d31e1bbf693c95bf8f38e Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Sat, 21 Feb 2026 10:24:55 +0100 Subject: [PATCH] feat: remember rejected candidates, skip in future bulk discovery - Bulk discovery skips companies with any candidate (including rejected) - Single discovery skips URLs from previously rejected domains - Dashboard shows list of companies rejected by admin with note that they won't be re-searched in bulk mode Co-Authored-By: Claude Opus 4.6 --- blueprints/admin/routes_data_quality.py | 24 ++++++++++++++++++++ blueprints/admin/routes_website_discovery.py | 4 ++-- services/website_discovery_service.py | 9 +++++++- templates/admin/data_quality_dashboard.html | 14 ++++++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) diff --git a/blueprints/admin/routes_data_quality.py b/blueprints/admin/routes_data_quality.py index 76a5307..aba3691 100644 --- a/blueprints/admin/routes_data_quality.py +++ b/blueprints/admin/routes_data_quality.py @@ -260,6 +260,29 @@ def admin_data_quality(): ), }) + # Companies with rejected candidates (already reviewed) + rejected_company_ids = set( + r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter( + WebsiteDiscoveryCandidate.status == 'rejected' + ).distinct().all() + ) + # Exclude companies that also have pending/accepted candidates + active_candidate_ids = set( + r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter( + WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted']) + ).distinct().all() + ) + only_rejected_ids = rejected_company_ids - active_candidate_ids + rejected_companies = [] + for cid in only_rejected_ids: + comp = company_map.get(cid) + if comp and not comp.website: + rejected_companies.append({ + 'company_name': comp.name, + 'company_id': cid, + }) + rejected_companies.sort(key=lambda x: x['company_name']) + # Count companies without website companies_without_website = sum(1 for c in companies_table if not c['website']) @@ -273,6 +296,7 @@ def admin_data_quality(): companies_table=companies_table, available_data=available_data, discovery_data=discovery_data, + rejected_companies=rejected_companies, companies_without_website=companies_without_website, now=now, ) diff --git a/blueprints/admin/routes_website_discovery.py b/blueprints/admin/routes_website_discovery.py index c06c0a0..84b5d24 100644 --- a/blueprints/admin/routes_website_discovery.py +++ b/blueprints/admin/routes_website_discovery.py @@ -80,10 +80,10 @@ def discover_websites_bulk(): _save_job(job_id, job) db = SessionLocal() try: - # Skip companies that already have a pending/accepted candidate + # Skip companies that already have any candidate (pending/accepted/rejected) already_have = set( r[0] for r in db.query(WebsiteDiscoveryCandidate.company_id).filter( - WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted']) + WebsiteDiscoveryCandidate.status.in_(['pending', 'accepted', 'rejected']) ).distinct().all() ) diff --git a/services/website_discovery_service.py b/services/website_discovery_service.py index f512163..f05aabc 100644 --- a/services/website_discovery_service.py +++ b/services/website_discovery_service.py @@ -288,12 +288,19 @@ class WebsiteDiscoveryService: if domain.startswith('www.'): domain = domain[4:] - # Check for existing candidate with this URL + # Check for existing candidate (exact URL or same domain rejected) existing = db.query(WebsiteDiscoveryCandidate).filter_by( company_id=company.id, candidate_url=url ).first() if existing: continue + rejected_domain = db.query(WebsiteDiscoveryCandidate).filter( + WebsiteDiscoveryCandidate.company_id == company.id, + WebsiteDiscoveryCandidate.candidate_domain == domain, + WebsiteDiscoveryCandidate.status == 'rejected', + ).first() + if rejected_domain: + continue # Fetch root + common subpages for verification data all_text = '' diff --git a/templates/admin/data_quality_dashboard.html b/templates/admin/data_quality_dashboard.html index 4dbcadc..7273648 100644 --- a/templates/admin/data_quality_dashboard.html +++ b/templates/admin/data_quality_dashboard.html @@ -750,6 +750,20 @@ {% else %}

Brak kandydatów. Kliknij "Szukaj WWW" aby uruchomić wyszukiwanie.

{% endif %} + + {% if rejected_companies %} +
+ + Odrzucone przez admina ({{ rejected_companies|length }}): + + + {% for rc in rejected_companies %}{{ rc.company_name }}{% if not loop.last %}, {% endif %}{% endfor %} + + + Te firmy nie będą ponownie wyszukiwane w trybie zbiorczym. + +
+ {% endif %}