fix: Remove sticky sidebar on account pages + add Companies to admin menu
Changes: - Remove position: sticky from konto sidebar (dane, prywatnosc, bezpieczenstwo, blokady) - Add "Firmy" link to admin dropdown menu (before "Użytkownicy") - Add scan_websites_for_nip.py script for data quality Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
a325e1b2e4
commit
99f7a5a88a
361
scripts/scan_websites_for_nip.py
Normal file
361
scripts/scan_websites_for_nip.py
Normal file
@ -0,0 +1,361 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Website NIP Scanner - skanuje strony www firm w poszukiwaniu NIP/REGON
|
||||
|
||||
Dla firm bez NIP w bazie - pobiera stronę www (z domeny email)
|
||||
i szuka numerów NIP/REGON w treści.
|
||||
|
||||
Usage:
|
||||
python scripts/scan_websites_for_nip.py # Skanuj wszystkie
|
||||
python scripts/scan_websites_for_nip.py --id 119 # Skanuj konkretną firmę
|
||||
python scripts/scan_websites_for_nip.py --apply # Zapisz znalezione NIP do bazy
|
||||
"""
|
||||
|
||||
import argparse
import json
import os
import re
import sys
import time
from dataclasses import asdict, dataclass, field
from datetime import datetime
from pathlib import Path
from typing import List, Optional, Tuple
from urllib.parse import urlparse

import requests
|
||||
|
||||
# Add parent directory to path for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from database import SessionLocal, Company
|
||||
|
||||
# Output directory for scan results
|
||||
RESULTS_DIR = Path(__file__).parent.parent / "data" / "nip_scan_results"
|
||||
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Domains to skip (public email providers)
|
||||
SKIP_DOMAINS = {
|
||||
'gmail.com', 'wp.pl', 'onet.pl', 'op.pl', 'interia.pl',
|
||||
'o2.pl', 'poczta.fm', 'yahoo.com', 'hotmail.com', 'outlook.com'
|
||||
}
|
||||
|
||||
# Request timeout
|
||||
REQUEST_TIMEOUT = 15
|
||||
|
||||
# User agent
|
||||
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||
|
||||
|
||||
@dataclass
class ScanResult:
    """Result of scanning one company's website for NIP/REGON numbers."""
    company_id: int
    company_name: str
    domain: str
    url_scanned: str
    nip_found: Optional[str] = None
    regon_found: Optional[str] = None
    # All NIPs found on the site (a page may list several entities).
    # default_factory avoids the shared-mutable-default pitfall.
    nips_all: List[str] = field(default_factory=list)
    regons_all: List[str] = field(default_factory=list)
    phone_found: Optional[str] = None
    address_found: Optional[str] = None
    confidence: str = "low"  # low, medium, high
    error: Optional[str] = None
    scanned_at: str = ""

    def __post_init__(self):
        # Backward compatibility: callers may still pass None explicitly
        # for the list fields; normalize to empty lists.
        if self.nips_all is None:
            self.nips_all = []
        if self.regons_all is None:
            self.regons_all = []
        # Timestamp the scan unless the caller supplied one.
        if not self.scanned_at:
            self.scanned_at = datetime.now().isoformat()

    def to_dict(self):
        """Serialize to a plain dict (for JSON output)."""
        return asdict(self)
|
||||
|
||||
|
||||
def extract_domain_from_email(email: str) -> Optional[str]:
    """Return the domain part of an e-mail address, or None.

    None is returned for empty/malformed addresses and for public
    e-mail providers listed in SKIP_DOMAINS (gmail, wp.pl, ...).
    """
    if not email or '@' not in email:
        return None
    domain = email.split('@')[1].lower()
    return None if domain in SKIP_DOMAINS else domain
|
||||
|
||||
|
||||
def normalize_nip(nip: str) -> str:
    """Reduce a NIP string to its ASCII digits (drops dashes, spaces, etc.)."""
    digits = '0123456789'
    return ''.join(ch for ch in nip if ch in digits)
|
||||
|
||||
|
||||
def validate_nip(nip: str) -> bool:
    """Validate a NIP via its weighted mod-11 checksum.

    Accepts formatted input (dashes/spaces); must reduce to exactly
    10 digits whose last digit equals the control sum.
    """
    digits = re.sub(r'[^0-9]', '', nip)
    if len(digits) != 10:
        return False
    weights = (6, 5, 7, 2, 3, 4, 5, 6, 7)
    try:
        # Control sum over the first 9 digits; a remainder of 10 can
        # never match a single check digit, so it fails naturally.
        total = sum(w * int(d) for w, d in zip(weights, digits))
        return total % 11 == int(digits[9])
    except (ValueError, IndexError):
        return False
|
||||
|
||||
|
||||
def validate_regon(regon: str) -> bool:
    """Validate a REGON checksum (9- or 14-digit variants)."""
    digits = re.sub(r'[^0-9]', '', regon)

    def control(weights, ds):
        # Weighted mod-11 control digit; a value of 10 maps to 0.
        value = sum(w * int(d) for w, d in zip(weights, ds)) % 11
        return 0 if value == 10 else value

    weights9 = (8, 9, 2, 3, 4, 5, 6, 7)

    if len(digits) == 9:
        return control(weights9, digits) == int(digits[8])

    if len(digits) == 14:
        # A 14-digit REGON embeds a valid 9-digit REGON as its prefix.
        if control(weights9, digits) != int(digits[8]):
            return False
        weights14 = (2, 4, 8, 5, 0, 9, 7, 3, 6, 1, 2, 4, 8)
        return control(weights14, digits) == int(digits[13])

    return False
|
||||
|
||||
|
||||
def find_nips_in_text(text: str) -> List[str]:
    """Extract all distinct, checksum-valid NIP numbers from page text."""
    patterns = (
        # "NIP: 123-456-78-90" style with optional dash/space separators
        r'NIP[:\s]*(\d{3}[-\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2})',
        # Bare 10 digits after the label
        r'NIP[:\s]*(\d{10})',
        r'numer\s+identyfikacji\s+podatkowej[:\s]*(\d{10})',
    )

    found: List[str] = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = normalize_nip(raw)
            if candidate not in found and validate_nip(candidate):
                found.append(candidate)

    return found
|
||||
|
||||
|
||||
def find_regons_in_text(text: str) -> List[str]:
    """Extract all distinct, checksum-valid REGON numbers from page text."""
    patterns = (
        r'REGON[:\s]*(\d{9,14})',
        r'rejestr\s+gospodarczy[:\s]*(\d{9,14})',
    )

    found: List[str] = []
    for pattern in patterns:
        for raw in re.findall(pattern, text, re.IGNORECASE):
            candidate = re.sub(r'[^0-9]', '', raw)
            if candidate not in found and validate_regon(candidate):
                found.append(candidate)

    return found
|
||||
|
||||
|
||||
def fetch_website(url: str) -> Tuple[Optional[str], Optional[str]]:
    """Download a web page, following redirects.

    Returns:
        (content, error): page HTML on success (error is None),
        otherwise (None, <short error description>).
    """
    headers = {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'pl-PL,pl;q=0.9,en;q=0.8',
    }

    try:
        resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        resp.raise_for_status()
        return resp.text, None
    except requests.exceptions.Timeout:
        return None, "Timeout"
    except requests.exceptions.ConnectionError:
        return None, "Connection error"
    except requests.exceptions.HTTPError as exc:
        return None, f"HTTP {exc.response.status_code}"
    except Exception as exc:  # catch-all boundary: report, don't crash the scan
        return None, str(exc)
|
||||
|
||||
|
||||
def scan_company_website(company: Company) -> ScanResult:
    """Scan a company's website looking for NIP/REGON numbers.

    The domain is taken from ``company.website`` when present, otherwise
    derived from the company e-mail address. Several URL variants
    (homepage plus common contact/about pages) are tried until one
    yields a match.

    Args:
        company: ORM row; reads ``id``, ``name``, ``website``, ``email``.

    Returns:
        ScanResult with found numbers and a confidence level, or with
        ``error`` set when no domain is available / nothing was found.
    """
    # Get domain from email or website
    domain = None
    if company.website:
        # Prepend a scheme when missing so urlparse fills netloc.
        parsed = urlparse(company.website if company.website.startswith('http') else f'https://{company.website}')
        # Fall back to the first path segment for odd inputs where
        # netloc still came out empty.
        domain = parsed.netloc or parsed.path.split('/')[0]
    elif company.email:
        domain = extract_domain_from_email(company.email)

    if not domain:
        return ScanResult(
            company_id=company.id,
            company_name=company.name,
            domain="",
            url_scanned="",
            error="No domain available"
        )

    # Clean domain
    # NOTE(review): replace('www.', '') strips the substring anywhere in
    # the string, not only as a prefix — presumably a prefix strip was
    # intended; confirm for domains containing "www." mid-string.
    domain = domain.lower().replace('www.', '')

    # Try different URL variants
    urls_to_try = [
        f"https://{domain}",
        f"https://www.{domain}",
        f"https://{domain}/kontakt",
        f"https://{domain}/o-nas",
        f"https://{domain}/contact",
        f"https://{domain}/about",
    ]

    result = ScanResult(
        company_id=company.id,
        company_name=company.name,
        domain=domain,
        url_scanned=""
    )

    all_nips = []
    all_regons = []

    for url in urls_to_try:
        print(f"  Scanning: {url}")
        content, error = fetch_website(url)

        if error:
            # Fetch failures are non-fatal: try the next URL variant.
            continue

        result.url_scanned = url

        # Find NIPs and REGONs
        nips = find_nips_in_text(content)
        regons = find_regons_in_text(content)

        # Accumulate while preserving first-seen order, no duplicates.
        all_nips.extend([n for n in nips if n not in all_nips])
        all_regons.extend([r for r in regons if r not in all_regons])

        # If found, set confidence
        if nips or regons:
            print(f"  Found NIP: {nips}, REGON: {regons}")
            break

        time.sleep(0.5)  # Rate limiting

    # Set results
    if all_nips:
        result.nips_all = all_nips
        result.nip_found = all_nips[0]  # Primary NIP
        # A single unique NIP across the site is a strong signal.
        result.confidence = "high" if len(all_nips) == 1 else "medium"

    if all_regons:
        result.regons_all = all_regons
        result.regon_found = all_regons[0]

    if not all_nips and not all_regons and not result.error:
        result.error = "NIP/REGON not found on website"
        result.confidence = "low"

    return result
|
||||
|
||||
|
||||
def get_companies_without_nip(db, company_id: Optional[int] = None) -> List[Company]:
    """Fetch companies lacking a NIP that have a scannable domain.

    Args:
        db: SQLAlchemy session.
        company_id: when given, restrict the query to that single company.

    Returns:
        Companies ordered by name whose e-mail domain is non-public
        (see SKIP_DOMAINS) or that have an explicit website set.
    """
    # SQLAlchemy needs `== None` (not `is None`) to emit IS NULL.
    query = db.query(Company).filter(
        (Company.nip == None) | (Company.nip == '')
    )

    if company_id:
        query = query.filter(Company.id == company_id)

    companies = query.order_by(Company.name).all()

    # Filter out companies with public email domains
    result = []
    for c in companies:
        domain = extract_domain_from_email(c.email) if c.email else None
        if domain or c.website:
            result.append(c)

    return result
|
||||
|
||||
|
||||
def main():
    """CLI entry point: scan companies, report, optionally persist NIPs.

    Side effects: prints progress, writes a JSON results file (to
    --output or RESULTS_DIR), and with --apply commits found NIP/REGON
    values to the database for medium/high-confidence matches.
    """
    parser = argparse.ArgumentParser(description="Scan websites for NIP/REGON")
    parser.add_argument('--id', type=int, help="Scan specific company ID")
    parser.add_argument('--apply', action='store_true', help="Apply found NIPs to database")
    parser.add_argument('--output', type=str, help="Output JSON file path")
    args = parser.parse_args()

    db = SessionLocal()

    try:
        companies = get_companies_without_nip(db, args.id)
        print(f"\n=== Skanowanie {len(companies)} firm bez NIP ===\n")

        results = []
        found_count = 0

        for i, company in enumerate(companies, 1):
            print(f"[{i}/{len(companies)}] {company.name}")

            result = scan_company_website(company)
            results.append(result)

            if result.nip_found:
                found_count += 1
                print(f"  ✓ NIP: {result.nip_found} (confidence: {result.confidence})")

                # Persist only confident matches; low-confidence hits stay
                # in the JSON report for manual review.
                if args.apply and result.confidence in ('high', 'medium'):
                    company.nip = result.nip_found
                    # Never overwrite an existing REGON.
                    if result.regon_found and not company.regon:
                        company.regon = result.regon_found
                    db.commit()
                    print(f"  → Zapisano do bazy")
            elif result.error:
                print(f"  ✗ {result.error}")

            time.sleep(1)  # Rate limiting between companies

        # Save results to JSON
        output_file = args.output or (RESULTS_DIR / f"scan_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json")
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump([r.to_dict() for r in results], f, ensure_ascii=False, indent=2)

        print(f"\n=== Podsumowanie ===")
        print(f"Przeskanowano: {len(companies)} firm")
        print(f"Znaleziono NIP: {found_count}")
        print(f"Wyniki zapisane: {output_file}")

        if found_count > 0 and not args.apply:
            print(f"\nUżyj --apply aby zapisać znalezione NIP do bazy")

    finally:
        # Always release the session, even when a scan raised.
        db.close()
|
||||
@ -1220,6 +1220,12 @@
|
||||
</svg>
|
||||
</button>
|
||||
<div class="admin-dropdown-menu">
|
||||
<a href="{{ url_for('admin.admin_companies') }}">
|
||||
<svg fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M19 21V5a2 2 0 00-2-2H7a2 2 0 00-2 2v16m14 0h2m-2 0h-5m-9 0H3m2 0h5M9 7h1m-1 4h1m4-4h1m-1 4h1m-5 10v-5a1 1 0 011-1h2a1 1 0 011 1v5m-4 0h4"/>
|
||||
</svg>
|
||||
Firmy
|
||||
</a>
|
||||
<a href="{{ url_for('admin.admin_users') }}">
|
||||
<svg fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 4.354a4 4 0 110 5.292M15 21H3v-1a6 6 0 0112 0v1zm0 0h6v-1a6 6 0 00-9-5.197M13 7a4 4 0 11-8 0 4 4 0 018 0z"/>
|
||||
|
||||
@ -18,8 +18,6 @@
|
||||
padding: var(--spacing-lg);
|
||||
box-shadow: var(--shadow);
|
||||
height: fit-content;
|
||||
position: sticky;
|
||||
top: 100px;
|
||||
}
|
||||
|
||||
.konto-sidebar-header {
|
||||
|
||||
@ -18,8 +18,6 @@
|
||||
padding: var(--spacing-lg);
|
||||
box-shadow: var(--shadow);
|
||||
height: fit-content;
|
||||
position: sticky;
|
||||
top: 100px;
|
||||
}
|
||||
|
||||
.konto-sidebar-header {
|
||||
|
||||
@ -18,8 +18,6 @@
|
||||
padding: var(--spacing-lg);
|
||||
box-shadow: var(--shadow);
|
||||
height: fit-content;
|
||||
position: sticky;
|
||||
top: 100px;
|
||||
}
|
||||
|
||||
.konto-sidebar-header {
|
||||
|
||||
@ -18,8 +18,6 @@
|
||||
padding: var(--spacing-lg);
|
||||
box-shadow: var(--shadow);
|
||||
height: fit-content;
|
||||
position: sticky;
|
||||
top: 100px;
|
||||
}
|
||||
|
||||
.konto-sidebar-header {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user