#!/usr/bin/env python3
"""
SEO Report Generator for Norda Biznes
=====================================

Generates HTML reports (single company or batch) and JSON exports from SEO audit data.
Designed for offline viewing, sharing with clients, and archiving audit results.

Usage:
    python seo_report_generator.py --company-id 26 --html
    python seo_report_generator.py --all --html --output ./reports
    python seo_report_generator.py --batch 1-10 --json
    python seo_report_generator.py --all --json --output ./exports

Output:
    - HTML: Styled, standalone reports suitable for viewing in browsers
    - JSON: Machine-readable exports for integration with other tools

Author: Claude Code
Date: 2026-01-08
"""

import os
import sys
import json
import argparse
import logging
from datetime import datetime
from typing import Optional, Dict, List, Any, Tuple
from pathlib import Path
from html import escape

from sqlalchemy import create_engine, text
from sqlalchemy.orm import sessionmaker

# Configure logging.
# NOTE: basicConfig runs at import time, so importing this module configures
# the root logger of the importing process as a side effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)

# Database configuration.
# The DATABASE_URL environment variable wins; the literal below is only the
# fallback used when the variable is not set.
# SECURITY(review): the fallback DSN embeds a real-looking user name, password
# and host address in source control. Anyone with read access to the
# repository (or to this diff) can read them. The credential should be rotated
# and the fallback removed (or replaced by a password-less local DSN) in a
# follow-up change; it is left untouched here so existing deployments that
# rely on the fallback keep working.
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://nordabiz_app:NordaBiz2025Secure@10.22.68.249:5432/nordabiz'
)

# Report version for tracking
REPORT_VERSION = '1.0.0'


class SEOReportGenerator:
    """
    Generates HTML and JSON reports from SEO audit data stored in database.

    The instance owns a SQLAlchemy engine and a session factory bound to it.
    """

    def __init__(self, database_url: str = DATABASE_URL):
        """
        Initialize report generator.

        Creates the SQLAlchemy engine and a session factory bound to it.

        Args:
            database_url: Database connection string. Defaults to the
                module-level DATABASE_URL, which is evaluated once at import
                time.
        """
        self.engine = create_engine(database_url)
        self.Session = sessionmaker(bind=self.engine)

    def get_companies_with_seo_data(
        self,
        company_ids: Optional[List[int]] = None,
        batch_start: Optional[int] = None,
        batch_end: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Fetch companies with their SEO analysis data from database.

        Exactly one selection mode is applied, checked in this order:

        1. ``company_ids`` is a non-empty list: only those company IDs.
        2. Both ``batch_start`` and ``batch_end`` are given: a window of rows
           from the result ordered by company ID.
        3. Otherwise: every active company.

        Note that an empty ``company_ids`` list is falsy and therefore falls
        through to mode 2 or 3 instead of returning no rows.

        Args:
            company_ids: List of specific company IDs to fetch.
            batch_start: Start index for batch processing (1-indexed).
            batch_end: End index for batch processing (1-indexed).

        Returns:
            List of company dicts with SEO analysis data, one dict per result
            row, keyed by the column names/aliases of the SELECT below.
        """
        with self.Session() as session:
            # Only active companies are selected. Both joins are LEFT JOINs,
            # so companies without an analysis row or without a category are
            # still returned, with NULLs in the wa.* / cat.* columns.
            base_query = """
                SELECT
                    c.id, c.name, c.slug, c.website, c.address_city,
                    c.nip, c.email, c.phone,
                    cat.name as category_name,
                    wa.analyzed_at, wa.website_url, wa.final_url,
                    wa.http_status_code, wa.load_time_ms,
                    wa.pagespeed_seo_score, wa.pagespeed_performance_score,
                    wa.pagespeed_accessibility_score, wa.pagespeed_best_practices_score,
                    wa.meta_title, wa.meta_description, wa.meta_keywords,
                    wa.h1_count, wa.h2_count, wa.h3_count, wa.h1_text,
                    wa.total_images, wa.images_without_alt, wa.images_with_alt,
                    wa.internal_links_count, wa.external_links_count,
                    wa.has_structured_data, wa.structured_data_types,
                    wa.has_canonical, wa.canonical_url, wa.is_indexable, wa.noindex_reason,
                    wa.has_sitemap, wa.has_robots_txt,
                    wa.viewport_configured, wa.is_mobile_friendly,
                    wa.largest_contentful_paint_ms, wa.first_input_delay_ms, wa.cumulative_layout_shift,
                    wa.has_og_tags, wa.og_title, wa.og_description, wa.og_image,
                    wa.has_twitter_cards, wa.html_lang, wa.has_hreflang,
                    wa.word_count_homepage,
                    wa.seo_audit_version, wa.seo_audited_at, wa.seo_audit_errors,
                    wa.seo_overall_score, wa.seo_health_score, wa.seo_issues,
                    wa.has_ssl, wa.ssl_expires_at
                FROM companies c
                LEFT JOIN company_website_analysis wa ON c.id = wa.company_id
                LEFT JOIN categories cat ON c.category_id = cat.id
                WHERE c.is_active = TRUE
            """

            if company_ids:
                # The ID list is passed as a single bound parameter.
                # NOTE(review): "= ANY(:ids)" is PostgreSQL array syntax and
                # presumably relies on the driver adapting a Python list to an
                # array - confirm if another backend is ever used.
                query = text(base_query + " AND c.id = ANY(:ids) ORDER BY c.id")
                result = session.execute(query, {'ids': company_ids})
            elif batch_start is not None and batch_end is not None:
                # The batch is a window over result rows ordered by company
                # ID: 1-indexed inclusive bounds become a 0-based OFFSET and a
                # row-count LIMIT.
                # NOTE(review): the window counts joined rows, not companies.
                # If a company can have several company_website_analysis rows
                # it would occupy several positions - confirm company_id is
                # unique in that table.
                query = text(base_query + " ORDER BY c.id OFFSET :offset LIMIT :limit")
                result = session.execute(query, {
                    'offset': batch_start - 1,
                    'limit': batch_end - batch_start + 1
                })
            else:
                query = text(base_query + " ORDER BY c.id")
                result = session.execute(query)

            companies = []
            for row in result:
                company = dict(row._mapping)
                # Parse JSON fields if they are strings. A value that is
                # already decoded (not a str) is kept as is; a string that is
                # not valid JSON is replaced by an empty list.
                if company.get('seo_issues') and isinstance(company['seo_issues'], str):
                    try:
                        company['seo_issues'] = json.loads(company['seo_issues'])
                    except json.JSONDecodeError:
                        company['seo_issues'] = []
                if company.get('seo_audit_errors') and isinstance(company['seo_audit_errors'], str):
                    try:
                        company['seo_audit_errors'] = json.loads(company['seo_audit_errors'])
                    except json.JSONDecodeError:
                        company['seo_audit_errors'] = []
                companies.append(company)

            return companies
+ """ + # Escape HTML in all string values + def safe(value): + if value is None: + return '' + return escape(str(value)) + + # Score color helper + def score_color(score): + if score is None: + return '#6c757d' # gray + if score >= 90: + return '#28a745' # green + if score >= 50: + return '#ffc107' # yellow + return '#dc3545' # red + + def score_label(score): + if score is None: + return 'Brak danych' + if score >= 90: + return 'Doskonały' + if score >= 70: + return 'Dobry' + if score >= 50: + return 'Średni' + return 'Wymaga poprawy' + + # Generate recommendations based on issues + recommendations = [] + if include_recommendations: + recommendations = self._generate_recommendations(company) + + # Build HTML + html = f''' + + + + + Raport SEO - {safe(company.get('name'))} + + + +
+
+

{safe(company.get('name'))}

+
+
Raport SEO wygenerowany: {datetime.now().strftime('%d.%m.%Y %H:%M')}
+
Strona: {safe(company.get('website') or company.get('website_url') or 'Brak')}
+ {f'
Kategoria: {safe(company.get("category_name"))}
' if company.get('category_name') else ''} +
+
+ +
+ +
+

Wyniki SEO

+
+
+
Ogolny wynik SEO
+
+ {company.get('seo_overall_score') if company.get('seo_overall_score') is not None else '—'} +
+
{score_label(company.get('seo_overall_score'))}
+
+
+
PageSpeed SEO
+
+ {company.get('pagespeed_seo_score') if company.get('pagespeed_seo_score') is not None else '—'} +
+
{score_label(company.get('pagespeed_seo_score'))}
+
+
+
Wydajnosc
+
+ {company.get('pagespeed_performance_score') if company.get('pagespeed_performance_score') is not None else '—'} +
+
{score_label(company.get('pagespeed_performance_score'))}
+
+
+
Dostepnosc
+
+ {company.get('pagespeed_accessibility_score') if company.get('pagespeed_accessibility_score') is not None else '—'} +
+
{score_label(company.get('pagespeed_accessibility_score'))}
+
+
+
+ + +
+

Szczegoly techniczne

+
+
+

Meta tagi

+
+ Tytul strony + + {self._truncate(safe(company.get('meta_title')), 40) or '—'} + +
+
+ Opis meta + + {f'Tak ({len(company.get("meta_description") or "")} zn.)' if company.get('meta_description') else 'Brak'} + +
+
+ Canonical URL + + {'Tak' if company.get('has_canonical') else 'Nie'} + +
+
+
+

Struktura naglowkow

+
+ H1 + + {self._h1_badge(company.get('h1_count'))} + +
+
+ H2 + {company.get('h2_count') if company.get('h2_count') is not None else '—'} +
+
+ H3 + {company.get('h3_count') if company.get('h3_count') is not None else '—'} +
+
+
+

Obrazy

+
+ Liczba obrazow + {company.get('total_images') if company.get('total_images') is not None else '—'} +
+
+ Bez alt + + {self._images_alt_badge(company.get('images_without_alt'), company.get('total_images'))} + +
+
+ Z alt + {company.get('images_with_alt') if company.get('images_with_alt') is not None else '—'} +
+
+
+

Linki

+
+ Wewnetrzne + {company.get('internal_links_count') if company.get('internal_links_count') is not None else '—'} +
+
+ Zewnetrzne + {company.get('external_links_count') if company.get('external_links_count') is not None else '—'} +
+
+
+
+ + +
+

Techniczne SEO

+
+
+

Pliki i indeksowanie

+
+ robots.txt + + {'Tak' if company.get('has_robots_txt') else 'Nie'} + +
+
+ sitemap.xml + + {'Tak' if company.get('has_sitemap') else 'Nie'} + +
+
+ Indeksowalnosc + + {'Tak' if company.get('is_indexable') else f'Nie ({safe(company.get("noindex_reason") or "")})'} + +
+
+
+

Bezpieczenstwo i mobilnosc

+
+ SSL/HTTPS + + {'Tak' if company.get('has_ssl') else 'Nie'} + +
+
+ Viewport + + {'Tak' if company.get('viewport_configured') else 'Nie'} + +
+
+ Mobile-friendly + + {'Tak' if company.get('is_mobile_friendly') else 'Nie'} + +
+
+
+

Dane strukturalne

+
+ Schema.org + + {'Tak' if company.get('has_structured_data') else 'Nie'} + +
+
+ Typy + + {', '.join(company.get('structured_data_types') or []) or '—'} + +
+
+
+

Social Media

+
+ Open Graph + + {'Tak' if company.get('has_og_tags') else 'Nie'} + +
+
+ Twitter Cards + + {'Tak' if company.get('has_twitter_cards') else 'Nie'} + +
+
+ Jezyk (lang) + {safe(company.get('html_lang')) or '—'} +
+
+
+
+ + + {self._core_web_vitals_section(company)} + + + {self._issues_section(company)} + + + {self._recommendations_section(recommendations) if recommendations else ''} + +
+ + +
+ +''' + + return html + + def _truncate(self, text: str, length: int) -> str: + """Truncate text with ellipsis.""" + if not text: + return '' + if len(text) <= length: + return text + return text[:length] + '...' + + def _h1_badge(self, count: Optional[int]) -> str: + """Generate badge for H1 count.""" + if count is None: + return '' + if count == 1: + return f'{count}' + if count == 0: + return '0 (brak!)' + return f'{count} (za duzo)' + + def _images_alt_badge(self, without_alt: Optional[int], total: Optional[int]) -> str: + """Generate badge for images without alt.""" + if without_alt is None: + return '' + if without_alt == 0: + return '0' + if total and without_alt / total > 0.5: + return f'{without_alt}' + return f'{without_alt}' + + def _core_web_vitals_section(self, company: Dict[str, Any]) -> str: + """Generate Core Web Vitals section HTML.""" + lcp = company.get('largest_contentful_paint_ms') + fid = company.get('first_input_delay_ms') + cls = company.get('cumulative_layout_shift') + + if lcp is None and fid is None and cls is None: + return '' + + def lcp_status(val): + if val is None: + return ('—', 'badge-secondary') + if val <= 2500: + return (f'{val}ms', 'badge-success') + if val <= 4000: + return (f'{val}ms', 'badge-warning') + return (f'{val}ms', 'badge-danger') + + def fid_status(val): + if val is None: + return ('—', 'badge-secondary') + if val <= 100: + return (f'{val}ms', 'badge-success') + if val <= 300: + return (f'{val}ms', 'badge-warning') + return (f'{val}ms', 'badge-danger') + + def cls_status(val): + if val is None: + return ('—', 'badge-secondary') + if val <= 0.1: + return (f'{val:.3f}', 'badge-success') + if val <= 0.25: + return (f'{val:.3f}', 'badge-warning') + return (f'{val:.3f}', 'badge-danger') + + lcp_val, lcp_class = lcp_status(lcp) + fid_val, fid_class = fid_status(fid) + cls_val, cls_class = cls_status(cls) + + return f''' +
+

Core Web Vitals

+
+
+

LCP (Largest Contentful Paint)

+
+ Wynik + {lcp_val} +
+
+ Cel + < 2500ms +
+
+
+

FID (First Input Delay)

+
+ Wynik + {fid_val} +
+
+ Cel + < 100ms +
+
+
+

CLS (Cumulative Layout Shift)

+
+ Wynik + {cls_val} +
+
+ Cel + < 0.1 +
+
+
+
+ ''' + + def _issues_section(self, company: Dict[str, Any]) -> str: + """Generate issues section HTML.""" + issues = company.get('seo_issues') or [] + errors = company.get('seo_audit_errors') or [] + + if not issues and not errors: + return '' + + items_html = '' + for issue in issues: + if isinstance(issue, dict): + severity = issue.get('severity', 'info') + message = escape(issue.get('message', '')) + else: + severity = 'info' + message = escape(str(issue)) + items_html += f'
  • {message}
  • \n' + + for error in errors: + items_html += f'
  • {escape(str(error))}
  • \n' + + return f''' +
    +

    Wykryte problemy

    + +
    + ''' + + def _recommendations_section(self, recommendations: List[str]) -> str: + """Generate recommendations section HTML.""" + if not recommendations: + return '' + + items_html = ''.join(f'
  • {escape(rec)}
  • \n' for rec in recommendations) + + return f''' +
    +

    Rekomendacje

    +
    +
      + {items_html} +
    +
    +
    + ''' + + def _generate_recommendations(self, company: Dict[str, Any]) -> List[str]: + """Generate SEO improvement recommendations based on audit data.""" + recommendations = [] + + # Meta tags + if not company.get('meta_title'): + recommendations.append( + 'Dodaj znacznik do strony. Powinien miec 50-60 znakow i zawierac slowa kluczowe.' + ) + if not company.get('meta_description'): + recommendations.append( + 'Dodaj meta description (150-160 znakow). Dobry opis zwieksza CTR w wynikach wyszukiwania.' + ) + + # Headings + h1_count = company.get('h1_count') + if h1_count == 0: + recommendations.append( + 'Dodaj naglowek H1 do strony. Kazda strona powinna miec dokladnie jeden H1.' + ) + elif h1_count and h1_count > 1: + recommendations.append( + f'Strona ma {h1_count} naglowkow H1. Pozostaw tylko jeden glowny naglowek H1.' + ) + + # Images + images_without_alt = company.get('images_without_alt') + if images_without_alt and images_without_alt > 0: + recommendations.append( + f'Dodaj atrybuty alt do {images_without_alt} obrazow. Alt poprawia SEO i dostepnosc.' + ) + + # Technical SEO + if not company.get('has_robots_txt'): + recommendations.append( + 'Utworz plik robots.txt w glownym katalogu strony.' + ) + if not company.get('has_sitemap'): + recommendations.append( + 'Utworz i zglos mape strony (sitemap.xml) w Google Search Console.' + ) + if not company.get('has_canonical'): + recommendations.append( + 'Dodaj znacznik canonical URL aby uniknac problemow z duplikacja tresci.' + ) + if not company.get('has_ssl'): + recommendations.append( + 'Wlacz certyfikat SSL (HTTPS). Google premiuje strony z bezpiecznym polaczeniem.' + ) + + # Mobile + if not company.get('viewport_configured'): + recommendations.append( + 'Dodaj znacznik viewport meta dla prawidlowego wyswietlania na urzadzeniach mobilnych.' + ) + + # Structured data + if not company.get('has_structured_data'): + recommendations.append( + 'Dodaj dane strukturalne (Schema.org) - np. 
LocalBusiness dla lepszej widocznosci w Google.' + ) + + # Open Graph + if not company.get('has_og_tags'): + recommendations.append( + 'Dodaj znaczniki Open Graph dla lepszego wygladu przy udostepnianiu w mediach spolecznosciowych.' + ) + + # Performance + lcp = company.get('largest_contentful_paint_ms') + if lcp and lcp > 2500: + recommendations.append( + f'Popraw LCP (obecnie {lcp}ms). Zoptymalizuj obrazy i skrypty dla szybszego ladowania.' + ) + + cls = company.get('cumulative_layout_shift') + if cls and cls > 0.1: + recommendations.append( + f'Popraw CLS (obecnie {cls:.3f}). Zdefiniuj wymiary obrazow i unikaj dynamicznego dodawania tresci.' + ) + + return recommendations + + def generate_batch_html_report( + self, + companies: List[Dict[str, Any]], + title: str = "Raport SEO - Norda Biznes" + ) -> str: + """ + Generate batch HTML report summarizing multiple companies. + + Args: + companies: List of company data dicts with SEO analysis. + title: Report title. + + Returns: + HTML string of the batch summary report. 
+ """ + def safe(value): + if value is None: + return '' + return escape(str(value)) + + def score_color(score): + if score is None: + return '#6c757d' + if score >= 90: + return '#28a745' + if score >= 50: + return '#ffc107' + return '#dc3545' + + # Calculate statistics + total = len(companies) + audited = sum(1 for c in companies if c.get('seo_audited_at')) + scores = [c.get('seo_overall_score') for c in companies if c.get('seo_overall_score') is not None] + avg_score = sum(scores) / len(scores) if scores else 0 + excellent = sum(1 for s in scores if s >= 90) + good = sum(1 for s in scores if 70 <= s < 90) + fair = sum(1 for s in scores if 50 <= s < 70) + poor = sum(1 for s in scores if s < 50) + + # Generate table rows + rows_html = '' + for company in sorted(companies, key=lambda c: c.get('seo_overall_score') or 0, reverse=True): + overall = company.get('seo_overall_score') + perf = company.get('pagespeed_performance_score') + seo = company.get('pagespeed_seo_score') + acc = company.get('pagespeed_accessibility_score') + + rows_html += f''' + <tr> + <td><strong>{safe(company.get('name'))}</strong></td> + <td>{safe(company.get('category_name') or '—')}</td> + <td style="color: {score_color(overall)}; font-weight: bold;">{overall if overall is not None else '—'}</td> + <td style="color: {score_color(seo)};">{seo if seo is not None else '—'}</td> + <td style="color: {score_color(perf)};">{perf if perf is not None else '—'}</td> + <td style="color: {score_color(acc)};">{acc if acc is not None else '—'}</td> + <td>{'<span class="badge badge-success">Tak</span>' if company.get('has_ssl') else '<span class="badge badge-danger">Nie</span>'}</td> + <td>{'<span class="badge badge-success">Tak</span>' if company.get('is_mobile_friendly') else '<span class="badge badge-warning">Nie</span>'}</td> + </tr> + ''' + + html = f'''<!DOCTYPE html> +<html lang="pl"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + 
<title>{safe(title)} + + + +
    +
    +

    {safe(title)}

    +
    Wygenerowano: {datetime.now().strftime('%d.%m.%Y %H:%M')}
    +
    +
    +
    +
    +
    {total}
    +
    Firm w raporcie
    +
    +
    +
    {audited}
    +
    Przebadanych
    +
    +
    +
    {avg_score:.1f}
    +
    Sredni wynik
    +
    +
    +
    {excellent}
    +
    Doskonaly (90+)
    +
    +
    +
    {good}
    +
    Dobry (70-89)
    +
    +
    +
    {fair}
    +
    Sredni (50-69)
    +
    +
    +
    {poor}
    +
    Slaby (<50)
    +
    +
    + +

    Wyniki poszczegolnych firm

    + + + + + + + + + + + + + + + {rows_html} + +
    FirmaKategoriaWynik SEOPS SEOWydajnoscDostepnoscSSLMobile
    +
    + +
    def generate_json_export(
        self,
        companies: List[Dict[str, Any]],
        include_raw_data: bool = False
    ) -> Dict[str, Any]:
        """
        Generate JSON export of SEO audit data.

        The export has a header (report version, generation time, totals,
        statistics from ``_calculate_statistics``) and one entry per company.
        The detailed sections (``pagespeed``, ``on_page``, ``technical``,
        ``core_web_vitals``, ``social``, ``issues``, ``errors``) are only
        added for companies that have a ``seo_audited_at`` value.

        Args:
            companies: List of company data dicts.
            include_raw_data: When True, each company entry additionally gets
                a ``raw_data`` key holding a shallow copy of the input dict.
                Raw values are not converted, so they may contain types such
                as datetime that need a ``default=`` hook when serialized.

        Returns:
            Dict ready for JSON serialization.
        """
        export = {
            'report_version': REPORT_VERSION,
            'generated_at': datetime.now().isoformat(),
            'total_companies': len(companies),
            'audited_companies': sum(1 for c in companies if c.get('seo_audited_at')),
            'statistics': self._calculate_statistics(companies),
            'companies': []
        }

        for company in companies:
            audited_at = company.get('seo_audited_at')
            if not audited_at:
                audited_at_text = None
            elif hasattr(audited_at, 'isoformat'):
                audited_at_text = audited_at.isoformat()
            else:
                # The timestamp may already be text; calling .isoformat() on
                # it would raise AttributeError.
                audited_at_text = str(audited_at)

            company_data = {
                'id': company.get('id'),
                'name': company.get('name'),
                'slug': company.get('slug'),
                'website': company.get('website') or company.get('website_url'),
                'category': company.get('category_name'),
                'nip': company.get('nip'),
                'city': company.get('address_city'),
                'seo_audit': {
                    'audited_at': audited_at_text,
                    'audit_version': company.get('seo_audit_version'),
                    'overall_score': company.get('seo_overall_score'),
                    'health_score': company.get('seo_health_score'),
                }
            }

            if audited_at:
                seo_audit = company_data['seo_audit']
                seo_audit['pagespeed'] = {
                    'seo_score': company.get('pagespeed_seo_score'),
                    'performance_score': company.get('pagespeed_performance_score'),
                    'accessibility_score': company.get('pagespeed_accessibility_score'),
                    'best_practices_score': company.get('pagespeed_best_practices_score'),
                }
                seo_audit['on_page'] = {
                    'meta_title': company.get('meta_title'),
                    'meta_description': company.get('meta_description'),
                    'h1_count': company.get('h1_count'),
                    'h2_count': company.get('h2_count'),
                    'h3_count': company.get('h3_count'),
                    'total_images': company.get('total_images'),
                    'images_without_alt': company.get('images_without_alt'),
                    'internal_links': company.get('internal_links_count'),
                    'external_links': company.get('external_links_count'),
                    'has_structured_data': company.get('has_structured_data'),
                    'structured_data_types': company.get('structured_data_types'),
                }
                seo_audit['technical'] = {
                    'has_ssl': company.get('has_ssl'),
                    'has_sitemap': company.get('has_sitemap'),
                    'has_robots_txt': company.get('has_robots_txt'),
                    'has_canonical': company.get('has_canonical'),
                    'is_indexable': company.get('is_indexable'),
                    'is_mobile_friendly': company.get('is_mobile_friendly'),
                    'viewport_configured': company.get('viewport_configured'),
                    'http_status': company.get('http_status_code'),
                    'load_time_ms': company.get('load_time_ms'),
                }
                # Compare against None, not truthiness: a layout shift of 0 is
                # a valid (and the best possible) measurement and must not be
                # exported as null.
                layout_shift = company.get('cumulative_layout_shift')
                seo_audit['core_web_vitals'] = {
                    'lcp_ms': company.get('largest_contentful_paint_ms'),
                    'fid_ms': company.get('first_input_delay_ms'),
                    'cls': float(layout_shift) if layout_shift is not None else None,
                }
                seo_audit['social'] = {
                    'has_og_tags': company.get('has_og_tags'),
                    'og_title': company.get('og_title'),
                    'has_twitter_cards': company.get('has_twitter_cards'),
                }
                seo_audit['issues'] = company.get('seo_issues') or []
                seo_audit['errors'] = company.get('seo_audit_errors') or []

            if include_raw_data:
                # Shallow copy so later changes to the export entry do not
                # write through to the caller's input dict.
                company_data['raw_data'] = dict(company)

            export['companies'].append(company_data)

        return export
def save_html_report(
    self,
    html: str,
    output_path: str
) -> str:
    """
    Write an HTML report to disk as UTF-8.

    Missing parent directories of the target are created first; an existing
    file at the target path is overwritten.

    Args:
        html: HTML content string.
        output_path: Path to save file.

    Returns:
        Absolute path of the written file, as a string.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open('w', encoding='utf-8') as handle:
        handle.write(html)
    return str(destination.absolute())
def parse_batch_argument(batch_str: str) -> Tuple[int, int]:
    """
    Parse batch argument in format 'START-END'.

    Both bounds are 1-indexed and inclusive; whitespace around each number is
    ignored.

    Args:
        batch_str: Range text such as '1-10'.

    Returns:
        Tuple ``(start, end)`` with ``1 <= start <= end``.

    Raises:
        ValueError: If the text is not exactly two '-'-separated integers,
            if START is below 1, or if END is below START.
    """
    # A string without '-' splits into one part and a string with several
    # '-' into more than two, so one length check covers both format errors.
    parts = batch_str.split('-')
    if len(parts) != 2:
        raise ValueError(f"Invalid batch format '{batch_str}'. Use START-END (e.g., 1-10)")

    try:
        start = int(parts[0].strip())
        end = int(parts[1].strip())
    except ValueError:
        # "from None": the int() failure adds nothing to the user-facing
        # message and would otherwise show up as a chained traceback.
        raise ValueError(
            f"Invalid batch values '{batch_str}'. START and END must be numbers"
        ) from None

    if start < 1:
        raise ValueError(f"Invalid batch start '{start}'. Must be >= 1")
    if end < start:
        raise ValueError(f"Invalid batch range '{start}-{end}'. END must be >= START")

    return start, end


def _write_company_report(
    generator,
    company: Dict[str, Any],
    output_dir: Path,
    include_recommendations: bool
) -> str:
    """Render and save the HTML report of one company; return the saved path."""
    # "or" instead of a .get() default: the key is always present in database
    # rows, so a NULL slug would otherwise produce "seo_report_None.html" and
    # several such companies would overwrite each other's report.
    slug = company.get('slug') or f"company_{company.get('id')}"
    html = generator.generate_html_report(
        company,
        include_recommendations=include_recommendations
    )
    return generator.save_html_report(html, output_dir / f"seo_report_{slug}.html")


def main():
    """
    Main entry point for CLI usage.

    Parses the command line, loads the selected companies and writes the
    requested HTML and/or JSON files into the output directory. Every
    validation or runtime failure prints an error and exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Generate SEO reports from Norda Biznes audit data',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    python seo_report_generator.py --company-id 26 --html
    python seo_report_generator.py --all --html --output ./reports
    python seo_report_generator.py --batch 1-10 --json
    python seo_report_generator.py --all --json --output ./exports
    python seo_report_generator.py --all --html --json --output ./output

Output formats:
    --html    Generate styled HTML reports
    --json    Generate JSON exports for integration

File naming:
    Single company HTML: seo_report_{slug}.html
    Batch HTML summary:  seo_report_batch_{timestamp}.html
    JSON export:         seo_export_{timestamp}.json
        """
    )

    # Selection arguments
    selection = parser.add_argument_group('Company Selection (choose one)')
    selection.add_argument('--company-id', type=int, metavar='ID',
                           help='Generate report for single company by ID')
    selection.add_argument('--company-ids', type=str, metavar='IDS',
                           help='Generate reports for multiple companies (comma-separated IDs)')
    selection.add_argument('--batch', type=str, metavar='RANGE',
                           help='Generate reports for batch of companies (e.g., 1-10)')
    selection.add_argument('--all', action='store_true',
                           help='Generate reports for all companies')

    # Output format arguments
    output_group = parser.add_argument_group('Output Format')
    output_group.add_argument('--html', action='store_true',
                              help='Generate HTML reports')
    output_group.add_argument('--json', action='store_true',
                              help='Generate JSON export')

    # Options
    options = parser.add_argument_group('Options')
    options.add_argument('--output', '-o', type=str, metavar='DIR', default='.',
                         help='Output directory (default: current directory)')
    options.add_argument('--no-recommendations', action='store_true',
                         help='Exclude recommendations from HTML reports')
    options.add_argument('--batch-summary', action='store_true',
                         help='Generate batch summary HTML instead of individual reports')
    options.add_argument('--verbose', '-v', action='store_true',
                         help='Verbose output')
    options.add_argument('--database-url', type=str, metavar='URL',
                         help='Database connection URL')

    args = parser.parse_args()

    # Configure logging
    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # Validate selection
    selection_count = sum([
        args.company_id is not None,
        args.company_ids is not None,
        args.batch is not None,
        args.all
    ])

    if selection_count == 0:
        parser.print_help()
        print("\nError: Please specify one of --company-id, --company-ids, --batch, or --all")
        sys.exit(1)

    if selection_count > 1:
        print("Error: Please specify only one selection method")
        sys.exit(1)

    # Validate output format
    if not args.html and not args.json:
        parser.print_help()
        print("\nError: Please specify at least one output format: --html or --json")
        sys.exit(1)

    # Parse selection arguments.
    # The checks use "is not None" to match the validation above. With plain
    # truthiness, "--company-id 0" or "--company-ids ''" counted as a
    # selection but then set no filter, so ALL companies were exported.
    company_ids = None
    batch_start, batch_end = None, None

    if args.company_id is not None:
        company_ids = [args.company_id]
    elif args.company_ids is not None:
        try:
            company_ids = [int(x.strip()) for x in args.company_ids.split(',')]
        except ValueError:
            print("Error: Invalid --company-ids format. Use comma-separated integers")
            sys.exit(1)
    elif args.batch is not None:
        try:
            batch_start, batch_end = parse_batch_argument(args.batch)
        except ValueError as e:
            print(f"Error: {e}")
            sys.exit(1)

    # Initialize generator
    database_url = args.database_url or DATABASE_URL
    try:
        generator = SEOReportGenerator(database_url=database_url)
    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        logger.error(f"Failed to initialize: {e}")
        print(f"Error: {e}")
        sys.exit(1)

    # Fetch data
    logger.info("Fetching company data from database...")
    try:
        companies = generator.get_companies_with_seo_data(
            company_ids=company_ids,
            batch_start=batch_start,
            batch_end=batch_end
        )
    except Exception as e:
        logger.error(f"Failed to fetch data: {e}")
        print(f"Error fetching data: {e}")
        sys.exit(1)

    if not companies:
        print("No companies found matching the criteria")
        sys.exit(1)

    logger.info(f"Found {len(companies)} companies")

    # Create output directory
    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    generated_files = []
    include_recommendations = not args.no_recommendations

    # Generate HTML reports
    if args.html:
        if args.batch_summary or len(companies) > 1:
            # Generate batch summary
            logger.info("Generating batch HTML summary...")
            html = generator.generate_batch_html_report(companies)
            filename = f"seo_report_batch_{timestamp}.html"
            filepath = generator.save_html_report(html, output_dir / filename)
            generated_files.append(filepath)
            logger.info(f"Saved: {filepath}")

            # Also generate individual reports if not only summary
            if not args.batch_summary:
                for company in companies:
                    filepath = _write_company_report(
                        generator, company, output_dir, include_recommendations
                    )
                    generated_files.append(filepath)
                    logger.debug(f"Saved: {filepath}")

                logger.info(f"Generated {len(companies)} individual HTML reports")
        else:
            # Single company report
            filepath = _write_company_report(
                generator, companies[0], output_dir, include_recommendations
            )
            generated_files.append(filepath)
            logger.info(f"Saved: {filepath}")

    # Generate JSON export
    if args.json:
        logger.info("Generating JSON export...")
        data = generator.generate_json_export(companies)
        filename = f"seo_export_{timestamp}.json"
        filepath = generator.save_json_export(data, output_dir / filename)
        generated_files.append(filepath)
        logger.info(f"Saved: {filepath}")

    # Summary
    print("\n" + "=" * 60)
    print("REPORT GENERATION COMPLETE")
    print("=" * 60)
    print(f"Companies processed: {len(companies)}")
    print(f"Files generated: {len(generated_files)}")
    print(f"Output directory: {output_dir.absolute()}")
    print("\nGenerated files:")
    for saved_path in generated_files:
        print(f"  - {saved_path}")
    print("=" * 60)


if __name__ == '__main__':
    main()