auto-claude: 3.2 - Add TechnicalSEOChecker class to scripts/seo_analyzer.py

Adds TechnicalSEOChecker class that performs technical SEO audits:
- robots.txt: checks existence, parses directives (Disallow, Allow, Sitemap)
  detects if blocks Googlebot or all bots
- sitemap.xml: checks existence, validates XML, counts URLs, detects sitemap index
- Canonical URLs: detects canonical tag, checks if self-referencing or cross-domain
- Noindex tags: checks meta robots and X-Robots-Tag HTTP header
- Redirect chains: follows up to 10 redirects, detects loops, HTTPS upgrades,
  www redirects, and mixed content issues

Includes:
- 8 dataclasses for structured results (RobotsTxtResult, SitemapResult, etc.)
- TechnicalSEOResult container for complete analysis
- check_technical_seo() convenience function
- CLI support: --technical/-t flag for technical-only analysis
- --all/-a flag for combined on-page and technical analysis
- --json/-j flag for JSON output

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-08 02:12:47 +01:00
parent 0c257f5e48
commit 81fc27dfa9

View File

@ -11,12 +11,24 @@ Analyzes HTML content for SEO factors including:
- Structured data detection (JSON-LD, Microdata, RDFa) - Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata - Open Graph and Twitter Card metadata
Usage: Also includes TechnicalSEOChecker for:
from seo_analyzer import OnPageSEOAnalyzer - robots.txt analysis
- sitemap.xml validation
- Canonical URL verification
- Noindex tag detection
- Redirect chain analysis
Usage:
from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
# On-page analysis
analyzer = OnPageSEOAnalyzer() analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html_content, base_url='https://example.com') result = analyzer.analyze_html(html_content, base_url='https://example.com')
# Technical SEO checks
checker = TechnicalSEOChecker()
tech_result = checker.check_url('https://example.com')
Author: Claude Code Author: Claude Code
Date: 2026-01-08 Date: 2026-01-08
""" """
@ -24,10 +36,13 @@ Date: 2026-01-08
import json import json
import re import re
import logging import logging
import time
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Any, Tuple from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass, field, asdict from dataclasses import dataclass, field, asdict
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup, Comment from bs4 import BeautifulSoup, Comment
# Configure logging # Configure logging
@ -750,6 +765,631 @@ class OnPageSEOAnalyzer:
return 0 return 0
# =============================================================================
# Technical SEO Checker
# =============================================================================

# Request configuration for TechnicalSEOChecker

# Per-request HTTP timeout (seconds) applied to every fetch the checker makes.
REQUEST_TIMEOUT = 15

# Browser-like User-Agent with an identifying product token appended so site
# owners can attribute the traffic to this tool.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'

# Maximum redirects to follow before giving up (bounds the redirect-chain walk).
MAX_REDIRECTS = 10
@dataclass
class RobotsTxtResult:
    """Analysis of a site's robots.txt file.

    Populated by TechnicalSEOChecker.check_robots_txt(); to_dict() yields a
    JSON-serializable view.
    """
    # True when robots.txt returned HTTP 200.
    exists: bool = False
    # Full URL that was fetched (e.g. 'https://example.com/robots.txt').
    url: Optional[str] = None
    # HTTP status code of the fetch, if a response was received.
    status_code: Optional[int] = None
    # Raw robots.txt body (only set when it exists).
    content: Optional[str] = None
    # Length of the body in characters.
    content_length: Optional[int] = None
    # De-duplicated 'Disallow:' values, collected across all user-agent sections.
    disallow_rules: List[str] = field(default_factory=list)
    # De-duplicated 'Allow:' values.
    allow_rules: List[str] = field(default_factory=list)
    # 'Sitemap:' URLs advertised in robots.txt.
    sitemap_urls: List[str] = field(default_factory=list)
    # 'Crawl-delay:' value when present and numeric.
    crawl_delay: Optional[float] = None
    # True when a blanket 'Disallow: /' applies to a Googlebot section.
    blocks_googlebot: bool = False
    # True when a blanket 'Disallow: /' applies to 'User-agent: *'.
    blocks_all_bots: bool = False
    # Fetch problems encountered (timeout, connection error, odd status).
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class SitemapResult:
    """Analysis of a sitemap.xml file.

    Populated by TechnicalSEOChecker.check_sitemap(); to_dict() yields a
    JSON-serializable view.
    """
    # True when the sitemap URL returned HTTP 200.
    exists: bool = False
    # The sitemap URL that was fetched.
    url: Optional[str] = None
    # HTTP status code of the fetch, if a response was received.
    status_code: Optional[int] = None
    # Body parsed successfully as XML.
    is_valid_xml: bool = False
    # Root contains <sitemap> entries — a sitemap index, not a URL set.
    is_sitemap_index: bool = False
    # Number of <url> entries (regular sitemap only).
    url_count: int = 0
    sitemap_count: int = 0  # For sitemap index
    # Up to 10 page URLs, or up to 5 child sitemap URLs for an index.
    sample_urls: List[str] = field(default_factory=list)
    # Last-Modified response header, if the server sent one.
    last_modified: Optional[str] = None
    # Body length in bytes.
    content_length: Optional[int] = None
    # Fetch/parse problems encountered.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class RedirectInfo:
    """Information about a single redirect hop in a chain."""
    # URL that issued the redirect.
    from_url: str
    # Target of the Location header (made absolute by the checker).
    to_url: str
    # 3xx status code (301, 302, 303, 307 or 308).
    status_code: int
    # http -> https where the hosts match once any 'www.' is stripped.
    is_https_upgrade: bool = False
    # Hosts match once 'www.' is stripped, but the raw hosts differ.
    is_www_redirect: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class RedirectChainResult:
    """Analysis of the redirect chain for a URL.

    Populated by TechnicalSEOChecker.check_redirect_chain(); to_dict() yields
    a JSON-serializable view with each hop as a plain dict.
    """
    # URL the walk started from.
    original_url: str
    # Last URL reached (the non-redirect response, or last URL attempted).
    final_url: str
    # Number of redirect hops followed.
    chain_length: int = 0
    # Forward reference: resolved only by typing tools, so this class does not
    # require RedirectInfo to be defined at class-creation time.
    redirects: List["RedirectInfo"] = field(default_factory=list)
    # A URL was visited twice while following redirects.
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # HTTP -> HTTPS -> HTTP
    # Wall-clock time spent walking the chain, in milliseconds.
    total_time_ms: Optional[int] = None
    # Problems encountered while walking the chain.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict.

        dataclasses.asdict() recurses into nested dataclasses, so the
        RedirectInfo entries already come back as plain dicts — no manual
        per-item conversion pass is needed.
        """
        return asdict(self)
@dataclass
class CanonicalResult:
    """Analysis of a page's canonical URL configuration.

    Populated by TechnicalSEOChecker._check_canonical() from the page HTML.
    """
    # A <link rel="canonical"> tag was found.
    has_canonical: bool = False
    # The href value exactly as written in the HTML.
    canonical_url: Optional[str] = None
    # Canonical points at the current page (host and path match, 'www.' ignored).
    is_self_referencing: bool = False
    # Canonical host differs from the current host ('www.' ignored).
    points_to_different_domain: bool = False
    # href was relative rather than an absolute http(s) URL.
    is_relative: bool = False
    # Resolved canonical has both a scheme and a host.
    is_valid_url: bool = False
    # Resolved canonical equals the current URL (trailing slash ignored).
    matches_current_url: bool = False
    # HTML parsing problems, if any.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class IndexabilityResult:
    """Analysis of page indexability.

    Populated by TechnicalSEOChecker._check_indexability() from the response
    headers and HTML meta tags.
    """
    # Flipped to False as soon as any noindex signal is found.
    is_indexable: bool = True
    # <meta name="robots"> or <meta name="googlebot"> contains 'noindex'.
    has_noindex_meta: bool = False
    # X-Robots-Tag response header contains 'noindex'.
    has_noindex_header: bool = False
    noindex_source: Optional[str] = None  # 'meta', 'header', 'robots.txt'
    # Raw content attribute of the meta robots tag, if present.
    meta_robots_content: Optional[str] = None
    # Raw X-Robots-Tag header value, if present.
    x_robots_tag: Optional[str] = None
    # HTML parsing problems, if any.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result.

    Container aggregating the per-area results produced by
    TechnicalSEOChecker.check_url().
    """
    # The (normalized) URL that was checked.
    url: str
    # ISO-8601 timestamp of when the check ran.
    checked_at: str
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    # Errors from the page fetch itself; per-area errors live on each sub-result.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict, delegating to each sub-result's to_dict()."""
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }
class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')

        # Access specific results
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Request timeout in seconds, applied to every HTTP request.
        """
        self.timeout = timeout
        # One shared Session reuses connections across the several requests a
        # full check performs and carries the custom User-Agent everywhere.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def check_url(self, url: str) -> TechnicalSEOResult:
        """
        Perform a complete technical SEO check for a URL.

        Args:
            url: The URL to check. 'https://' is assumed when no scheme is given.

        Returns:
            TechnicalSEOResult with all technical SEO analysis.
        """
        from datetime import datetime

        errors: List[str] = []

        # Normalize URL: default to HTTPS when the scheme is missing.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # robots.txt first: it may advertise the site's real sitemap URL.
        robots_result = self.check_robots_txt(base_url)

        # Prefer a sitemap advertised in robots.txt; otherwise fall back to
        # the conventional /sitemap.xml location.
        if robots_result.sitemap_urls:
            sitemap_url = robots_result.sitemap_urls[0]
        else:
            sitemap_url = f"{base_url}/sitemap.xml"
        sitemap_result = self.check_sitemap(sitemap_url)

        # Walk the redirect chain from the original URL.
        redirect_result = self.check_redirect_chain(url)

        # Fetch the page once and use it for both canonical and indexability
        # checks; on failure the defaults (empty results) are returned.
        canonical_result = CanonicalResult()
        indexability_result = IndexabilityResult()
        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            final_url = response.url
            if response.status_code == 200:
                canonical_result = self._check_canonical(response.text, final_url)
                indexability_result = self._check_indexability(response)
            else:
                errors.append(f"HTTP {response.status_code} when fetching page")
        except requests.exceptions.Timeout:
            errors.append(f"Timeout fetching {url}")
        except requests.exceptions.ConnectionError as e:
            errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            errors.append(f"Request error: {str(e)[:100]}")

        return TechnicalSEOResult(
            url=url,
            checked_at=datetime.now().isoformat(),
            robots_txt=robots_result,
            sitemap=sitemap_result,
            redirect_chain=redirect_result,
            canonical=canonical_result,
            indexability=indexability_result,
            errors=errors,
        )

    def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
        """
        Check the robots.txt file for a domain.

        Args:
            base_url: Base URL of the site (e.g., 'https://example.com').

        Returns:
            RobotsTxtResult with robots.txt analysis. A 404 is not an error —
            it simply means the file does not exist.
        """
        result = RobotsTxtResult()
        robots_url = f"{base_url.rstrip('/')}/robots.txt"
        result.url = robots_url
        try:
            response = self.session.get(robots_url, timeout=self.timeout)
            result.status_code = response.status_code
            if response.status_code == 200:
                result.exists = True
                result.content = response.text
                result.content_length = len(response.text)
                self._parse_robots_txt(response.text, result)
            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")
        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching robots.txt")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
        return result

    def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
        """Parse robots.txt content and populate *result* in place.

        Disallow/Allow rules from every user-agent section are collected
        together; the blocks_* flags track only a blanket 'Disallow: /' in
        the relevant section.
        """
        current_user_agent = None
        is_googlebot_section = False
        is_all_section = False
        for line in content.split('\n'):
            line = line.strip()
            # Skip empty lines and comments.
            if not line or line.startswith('#'):
                continue
            # Directives are 'name: value'. Split on the first colon only,
            # since values (e.g. sitemap URLs) contain colons themselves.
            if ':' not in line:
                continue
            directive, value = line.split(':', 1)
            directive = directive.strip().lower()
            value = value.strip()
            if directive == 'user-agent':
                current_user_agent = value.lower()
                is_googlebot_section = 'googlebot' in current_user_agent
                is_all_section = current_user_agent == '*'
            elif directive == 'disallow' and value:
                result.disallow_rules.append(value)
                # 'Disallow: /' blocks the entire site for that section.
                if value == '/' and (is_googlebot_section or is_all_section):
                    if is_googlebot_section:
                        result.blocks_googlebot = True
                    if is_all_section:
                        result.blocks_all_bots = True
            elif directive == 'allow' and value:
                result.allow_rules.append(value)
            elif directive == 'sitemap':
                if value and value not in result.sitemap_urls:
                    result.sitemap_urls.append(value)
            elif directive == 'crawl-delay':
                try:
                    result.crawl_delay = float(value)
                except ValueError:
                    pass  # non-numeric crawl-delay: ignore rather than fail
        # De-duplicate while preserving first-seen order.
        result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
        result.allow_rules = list(dict.fromkeys(result.allow_rules))

    def check_sitemap(self, sitemap_url: str) -> SitemapResult:
        """
        Check a sitemap.xml file.

        Args:
            sitemap_url: URL of the sitemap.

        Returns:
            SitemapResult with sitemap analysis. A 404 is not an error — it
            simply means the sitemap does not exist.
        """
        result = SitemapResult()
        result.url = sitemap_url
        try:
            response = self.session.get(sitemap_url, timeout=self.timeout)
            result.status_code = response.status_code
            if response.status_code == 200:
                result.exists = True
                result.content_length = len(response.content)
                last_modified = response.headers.get('Last-Modified')
                if last_modified:
                    result.last_modified = last_modified
                # Parse bytes, not text, so the XML declaration's declared
                # encoding is honored by ElementTree.
                self._parse_sitemap(response.content, result)
            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")
        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching sitemap")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
        return result

    @staticmethod
    def _find_loc(parent: ET.Element, ns: Dict[str, str]) -> Optional[ET.Element]:
        """Find a <loc> child, trying the sitemaps namespace first.

        NOTE: `parent.find('sm:loc', ns) or parent.find('loc')` would be
        wrong here — an ElementTree Element with no children is falsy, so a
        successfully found <loc> (which has text but no children) would be
        discarded and the non-namespaced fallback used instead. Compare
        against None explicitly.
        """
        loc = parent.find('sm:loc', ns)
        if loc is None:
            loc = parent.find('loc')
        return loc

    def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
        """Parse sitemap XML content and populate *result* in place."""
        try:
            root = ET.fromstring(content)
            result.is_valid_xml = True
            # Handle sitemaps with and without the standard namespace.
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            # findall returns a list, so `or` is a safe empty-list fallback.
            sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
            if sitemap_tags:
                # Sitemap index: entries point at child sitemaps, not pages.
                result.is_sitemap_index = True
                result.sitemap_count = len(sitemap_tags)
                for sitemap_tag in sitemap_tags[:5]:
                    loc = self._find_loc(sitemap_tag, ns)
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)
            else:
                # Regular sitemap: count page URLs and keep a small sample.
                url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
                result.url_count = len(url_tags)
                for url_tag in url_tags[:10]:
                    loc = self._find_loc(url_tag, ns)
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)
        except ET.ParseError as e:
            result.is_valid_xml = False
            result.errors.append(f"Invalid XML: {str(e)[:100]}")
        except Exception as e:
            result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")

    def check_redirect_chain(self, url: str) -> RedirectChainResult:
        """
        Follow and analyze the redirect chain for a URL.

        Follows up to MAX_REDIRECTS hops without auto-redirecting, recording
        each hop, loop detection, HTTPS upgrades, www redirects and
        HTTPS->HTTP downgrades (mixed content).

        Args:
            url: The URL to check.

        Returns:
            RedirectChainResult with redirect chain analysis. final_url is
            the last URL reached, whatever ended the walk.
        """
        result = RedirectChainResult(original_url=url, final_url=url)
        visited_urls = set()
        current_url = url
        start_time = time.time()
        for _ in range(MAX_REDIRECTS):
            if current_url in visited_urls:
                result.has_redirect_loop = True
                result.errors.append(f"Redirect loop detected at: {current_url}")
                break
            visited_urls.add(current_url)
            try:
                # allow_redirects=False so each hop is observed individually.
                response = self.session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False
                )
            except requests.exceptions.Timeout:
                result.errors.append(f"Timeout at: {current_url}")
                break
            except requests.exceptions.ConnectionError as e:
                result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
                break
            except requests.exceptions.RequestException as e:
                result.errors.append(f"Request error: {str(e)[:100]}")
                break
            if response.status_code not in (301, 302, 303, 307, 308):
                # Reached a non-redirect response: the chain ends here.
                break
            next_url = response.headers.get('Location')
            if not next_url:
                result.errors.append("Redirect without Location header")
                break
            # Resolve relative Location targets (urljoin also handles
            # root-relative '/path' and protocol-relative '//host/path').
            if not next_url.startswith(('http://', 'https://')):
                next_url = urljoin(current_url, next_url)
            parsed_from = urlparse(current_url)
            parsed_to = urlparse(next_url)
            bare_from = parsed_from.netloc.replace('www.', '')
            bare_to = parsed_to.netloc.replace('www.', '')
            result.redirects.append(RedirectInfo(
                from_url=current_url,
                to_url=next_url,
                status_code=response.status_code,
                is_https_upgrade=(
                    parsed_from.scheme == 'http' and
                    parsed_to.scheme == 'https' and
                    bare_from == bare_to
                ),
                is_www_redirect=(
                    bare_from == bare_to and
                    parsed_from.netloc != parsed_to.netloc
                ),
            ))
            # Mixed content: some hop went HTTPS and a later hop fell back
            # to HTTP (i.e. 'http' appears after the first 'https').
            if len(result.redirects) >= 2:
                schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                schemes.append(parsed_to.scheme)
                if 'https' in schemes:
                    first_https = schemes.index('https')
                    if 'http' in schemes[first_https + 1:]:
                        result.has_mixed_content = True
            current_url = next_url
        else:
            # Loop ran MAX_REDIRECTS times without reaching a final response.
            result.errors.append(f"Exceeded {MAX_REDIRECTS} redirects")
        # Whatever ended the walk (final response, loop, error, exhaustion),
        # the last URL visited is the final URL.
        result.final_url = current_url
        result.chain_length = len(result.redirects)
        result.total_time_ms = int((time.time() - start_time) * 1000)
        return result

    def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
        """
        Check canonical URL configuration from page HTML.

        Args:
            html: HTML content of the page.
            current_url: URL the page was fetched from (after redirects).

        Returns:
            CanonicalResult with canonical URL analysis.
        """
        result = CanonicalResult()
        try:
            # Prefer lxml for speed/robustness; fall back to the stdlib parser.
            soup = BeautifulSoup(html, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result
        canonical_tag = soup.find('link', rel='canonical')
        if canonical_tag:
            result.has_canonical = True
            canonical_url = canonical_tag.get('href', '')
            result.canonical_url = canonical_url
            if canonical_url:
                result.is_relative = not canonical_url.startswith(('http://', 'https://'))
                if result.is_relative:
                    # urljoin resolves path-relative, root-relative ('/x')
                    # and protocol-relative ('//host/x') hrefs correctly.
                    canonical_abs = urljoin(current_url, canonical_url)
                else:
                    canonical_abs = canonical_url
                parsed_canonical = urlparse(canonical_abs)
                parsed_current = urlparse(current_url)
                result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc)
                # Self-referencing: same host (ignoring 'www.') and same path.
                result.is_self_referencing = (
                    parsed_canonical.netloc.replace('www.', '') == parsed_current.netloc.replace('www.', '') and
                    parsed_canonical.path == parsed_current.path
                )
                result.points_to_different_domain = (
                    parsed_canonical.netloc.replace('www.', '') != parsed_current.netloc.replace('www.', '')
                )
                # Exact match, tolerating only a trailing-slash difference.
                result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/'))
        return result

    def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
        """
        Check whether the page is indexable via meta tags and HTTP headers.

        Args:
            response: Response object from fetching the page.

        Returns:
            IndexabilityResult with indexability analysis. The header check
            runs first, so 'header' wins as noindex_source when both signals
            are present.
        """
        result = IndexabilityResult()
        # X-Robots-Tag HTTP header applies regardless of the HTML content.
        x_robots = response.headers.get('X-Robots-Tag', '')
        if x_robots:
            result.x_robots_tag = x_robots
            if 'noindex' in x_robots.lower():
                result.has_noindex_header = True
                result.is_indexable = False
                result.noindex_source = 'header'
        try:
            soup = BeautifulSoup(response.text, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result
        # <meta name="robots"> — name matched case-insensitively.
        meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
        if meta_robots:
            content = meta_robots.get('content', '')
            result.meta_robots_content = content
            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'
        # Googlebot-specific meta tag can also impose noindex.
        meta_googlebot = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
        if meta_googlebot:
            content = meta_googlebot.get('content', '')
            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'
        return result
# Convenience function # Convenience function
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]: def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
""" """
@ -767,20 +1407,139 @@ def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
return result.to_dict() return result.to_dict()
def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience wrapper: run a complete technical SEO check on *url*.

    Instantiates a TechnicalSEOChecker with default settings, runs
    check_url(), and flattens the result for callers that want plain data.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    return TechnicalSEOChecker().check_url(url).to_dict()
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
import requests import argparse
if len(sys.argv) < 2: parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
print("Usage: python seo_analyzer.py <url>") parser.add_argument('url', help='URL to analyze')
print("Example: python seo_analyzer.py https://pixlab.pl") parser.add_argument('--technical', '-t', action='store_true',
sys.exit(1) help='Run technical SEO checks (robots.txt, sitemap, redirects)')
parser.add_argument('--all', '-a', action='store_true',
help='Run both on-page and technical SEO analysis')
parser.add_argument('--json', '-j', action='store_true',
help='Output results as JSON')
test_url = sys.argv[1] args = parser.parse_args()
test_url = args.url
print(f"Analyzing: {test_url}") print(f"Analyzing: {test_url}")
print("-" * 60) print("-" * 60)
# Run technical SEO checks if requested
if args.technical or args.all:
print("\n" + "=" * 60)
print("TECHNICAL SEO ANALYSIS")
print("=" * 60)
checker = TechnicalSEOChecker()
tech_result = checker.check_url(test_url)
if args.json:
print(json.dumps(tech_result.to_dict(), indent=2, default=str))
else:
print("\n=== ROBOTS.TXT ===")
print(f"Exists: {tech_result.robots_txt.exists}")
print(f"URL: {tech_result.robots_txt.url}")
print(f"Status code: {tech_result.robots_txt.status_code}")
if tech_result.robots_txt.exists:
print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
if tech_result.robots_txt.disallow_rules[:5]:
print(f" Sample: {tech_result.robots_txt.disallow_rules[:5]}")
print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
if tech_result.robots_txt.crawl_delay:
print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
if tech_result.robots_txt.errors:
print(f"Errors: {tech_result.robots_txt.errors}")
print("\n=== SITEMAP ===")
print(f"Exists: {tech_result.sitemap.exists}")
print(f"URL: {tech_result.sitemap.url}")
print(f"Status code: {tech_result.sitemap.status_code}")
if tech_result.sitemap.exists:
print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
if tech_result.sitemap.is_sitemap_index:
print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
else:
print(f"URL count: {tech_result.sitemap.url_count}")
if tech_result.sitemap.sample_urls:
print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
if tech_result.sitemap.errors:
print(f"Errors: {tech_result.sitemap.errors}")
print("\n=== REDIRECT CHAIN ===")
print(f"Original URL: {tech_result.redirect_chain.original_url}")
print(f"Final URL: {tech_result.redirect_chain.final_url}")
print(f"Chain length: {tech_result.redirect_chain.chain_length}")
if tech_result.redirect_chain.redirects:
for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
print(f" [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
if r.is_https_upgrade:
print(f" (HTTPS upgrade)")
if r.is_www_redirect:
print(f" (www redirect)")
print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
if tech_result.redirect_chain.errors:
print(f"Errors: {tech_result.redirect_chain.errors}")
print("\n=== CANONICAL ===")
print(f"Has canonical: {tech_result.canonical.has_canonical}")
if tech_result.canonical.has_canonical:
print(f"Canonical URL: {tech_result.canonical.canonical_url}")
print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
print(f"Is relative: {tech_result.canonical.is_relative}")
print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
if tech_result.canonical.errors:
print(f"Errors: {tech_result.canonical.errors}")
print("\n=== INDEXABILITY ===")
print(f"Is indexable: {tech_result.indexability.is_indexable}")
print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
if tech_result.indexability.noindex_source:
print(f"Noindex source: {tech_result.indexability.noindex_source}")
if tech_result.indexability.meta_robots_content:
print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
if tech_result.indexability.x_robots_tag:
print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
if tech_result.indexability.errors:
print(f"Errors: {tech_result.indexability.errors}")
if tech_result.errors:
print(f"\n=== GENERAL ERRORS ===")
for error in tech_result.errors:
print(f" - {error}")
# If only technical was requested, exit
if not args.all:
sys.exit(0)
# Run on-page analysis (default behavior)
print("\n" + "=" * 60)
print("ON-PAGE SEO ANALYSIS")
print("=" * 60)
# Fetch the page # Fetch the page
try: try:
headers = { headers = {
@ -797,65 +1556,68 @@ if __name__ == '__main__':
analyzer = OnPageSEOAnalyzer() analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html, test_url) result = analyzer.analyze_html(html, test_url)
# Print results if args.json:
print("\n=== META TAGS ===") print(json.dumps(result.to_dict(), indent=2, default=str))
print(f"Title: {result.meta_tags.title}") else:
print(f"Title length: {result.meta_tags.title_length}") # Print results
print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...") print("\n=== META TAGS ===")
print(f"Description length: {result.meta_tags.description_length}") print(f"Title: {result.meta_tags.title}")
print(f"Canonical: {result.meta_tags.canonical_url}") print(f"Title length: {result.meta_tags.title_length}")
print(f"Robots: {result.meta_tags.robots}") print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
print(f"Viewport: {result.meta_tags.viewport}") print(f"Description length: {result.meta_tags.description_length}")
print(f"Canonical: {result.meta_tags.canonical_url}")
print(f"Robots: {result.meta_tags.robots}")
print(f"Viewport: {result.meta_tags.viewport}")
print("\n=== OPEN GRAPH ===") print("\n=== OPEN GRAPH ===")
print(f"OG Title: {result.open_graph.og_title}") print(f"OG Title: {result.open_graph.og_title}")
print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...") print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
print(f"OG Image: {result.open_graph.og_image}") print(f"OG Image: {result.open_graph.og_image}")
print(f"OG Type: {result.open_graph.og_type}") print(f"OG Type: {result.open_graph.og_type}")
print("\n=== TWITTER CARD ===") print("\n=== TWITTER CARD ===")
print(f"Card Type: {result.twitter_card.card_type}") print(f"Card Type: {result.twitter_card.card_type}")
print(f"Title: {result.twitter_card.title}") print(f"Title: {result.twitter_card.title}")
print("\n=== HEADINGS ===") print("\n=== HEADINGS ===")
print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})") print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
print(f"H2: {result.headings.h2_count}") print(f"H2: {result.headings.h2_count}")
print(f"H3: {result.headings.h3_count}") print(f"H3: {result.headings.h3_count}")
print(f"H4: {result.headings.h4_count}") print(f"H4: {result.headings.h4_count}")
print(f"H5: {result.headings.h5_count}") print(f"H5: {result.headings.h5_count}")
print(f"H6: {result.headings.h6_count}") print(f"H6: {result.headings.h6_count}")
print(f"Has single H1: {result.headings.has_single_h1}") print(f"Has single H1: {result.headings.has_single_h1}")
print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}") print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
if result.headings.hierarchy_issues: if result.headings.hierarchy_issues:
print(f"Hierarchy issues: {result.headings.hierarchy_issues}") print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
print("\n=== IMAGES ===") print("\n=== IMAGES ===")
print(f"Total images: {result.images.total_images}") print(f"Total images: {result.images.total_images}")
print(f"With alt: {result.images.images_with_alt}") print(f"With alt: {result.images.images_with_alt}")
print(f"Without alt: {result.images.images_without_alt}") print(f"Without alt: {result.images.images_without_alt}")
print(f"With empty alt: {result.images.images_with_empty_alt}") print(f"With empty alt: {result.images.images_with_empty_alt}")
if result.images.alt_text_quality_issues: if result.images.alt_text_quality_issues:
print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}") print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
print("\n=== LINKS ===") print("\n=== LINKS ===")
print(f"Total links: {result.links.total_links}") print(f"Total links: {result.links.total_links}")
print(f"Internal: {result.links.internal_links}") print(f"Internal: {result.links.internal_links}")
print(f"External: {result.links.external_links}") print(f"External: {result.links.external_links}")
print(f"Nofollow: {result.links.nofollow_links}") print(f"Nofollow: {result.links.nofollow_links}")
print(f"Broken anchor links: {result.links.broken_anchor_links}") print(f"Broken anchor links: {result.links.broken_anchor_links}")
print(f"External domains: {result.links.unique_external_domains[:5]}") print(f"External domains: {result.links.unique_external_domains[:5]}")
print("\n=== STRUCTURED DATA ===") print("\n=== STRUCTURED DATA ===")
print(f"Has structured data: {result.structured_data.has_structured_data}") print(f"Has structured data: {result.structured_data.has_structured_data}")
print(f"JSON-LD count: {result.structured_data.json_ld_count}") print(f"JSON-LD count: {result.structured_data.json_ld_count}")
print(f"Microdata count: {result.structured_data.microdata_count}") print(f"Microdata count: {result.structured_data.microdata_count}")
print(f"RDFa count: {result.structured_data.rdfa_count}") print(f"RDFa count: {result.structured_data.rdfa_count}")
print(f"Schema types: {result.structured_data.all_types}") print(f"Schema types: {result.structured_data.all_types}")
print("\n=== OTHER ===") print("\n=== OTHER ===")
print(f"Word count: {result.word_count}") print(f"Word count: {result.word_count}")
print(f"Has DOCTYPE: {result.has_doctype}") print(f"Has DOCTYPE: {result.has_doctype}")
print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})") print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
if result.errors: if result.errors:
print(f"\nErrors: {result.errors}") print(f"\nErrors: {result.errors}")