From 81fc27dfa9e02385d9b2bc26c4ca13af585609a5 Mon Sep 17 00:00:00 2001
From: Maciej Pienczyn <maciej.pienczyn@inpi.pl>
Date: Thu, 8 Jan 2026 02:12:47 +0100
Subject: [PATCH] auto-claude: 3.2 - Add TechnicalSEOChecker class to
 scripts/seo_analyzer.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds TechnicalSEOChecker class that performs technical SEO audits:
- robots.txt: checks existence, parses directives (Disallow, Allow, Sitemap,
  Crawl-delay), and detects whether it blocks Googlebot or all bots
- sitemap.xml: checks existence, validates XML, counts URLs, detects sitemap index
- Canonical URLs: detects canonical tag, checks if self-referencing or cross-domain
- Noindex tags: checks meta robots and X-Robots-Tag HTTP header
- Redirect chains: follows up to 10 redirects, detects loops, HTTPS upgrades,
  www redirects, and chains that drop from HTTPS back to HTTP (reported as
  has_mixed_content)

Includes:
- 6 dataclasses for structured results (RobotsTxtResult, SitemapResult, etc.)
- TechnicalSEOResult container for complete analysis
- check_technical_seo() convenience function
- CLI support: --technical/-t flag for technical-only analysis
- --all/-a flag for combined on-page and technical analysis
- --json/-j flag for JSON output

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 scripts/seo_analyzer.py | 886 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 824 insertions(+), 62 deletions(-)

diff --git a/scripts/seo_analyzer.py b/scripts/seo_analyzer.py
index 22ea9b1..6de4c62 100644
--- a/scripts/seo_analyzer.py
+++ b/scripts/seo_analyzer.py
@@ -11,12 +11,24 @@ Analyzes HTML content for SEO factors including:
 - Structured data detection (JSON-LD, Microdata, RDFa)
 - Open Graph and Twitter Card metadata
 
-Usage:
-    from seo_analyzer import OnPageSEOAnalyzer
+Also includes TechnicalSEOChecker for:
+- robots.txt analysis
+- sitemap.xml validation
+- Canonical URL verification
+- Noindex tag detection
+- Redirect chain analysis
 
+Usage:
+    from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
+
+    # On-page analysis
     analyzer = OnPageSEOAnalyzer()
     result = analyzer.analyze_html(html_content, base_url='https://example.com')
 
+    # Technical SEO checks
+    checker = TechnicalSEOChecker()
+    tech_result = checker.check_url('https://example.com')
+
 Author: Claude Code
 Date: 2026-01-08
 """
@@ -24,10 +36,13 @@ Date: 2026-01-08
 import json
 import re
 import logging
+import time
+import xml.etree.ElementTree as ET
 from typing import Optional, Dict, List, Any, Tuple
 from dataclasses import dataclass, field, asdict
 from urllib.parse import urlparse, urljoin
 
+import requests
 from bs4 import BeautifulSoup, Comment
 
 # Configure logging
@@ -750,6 +765,631 @@ class OnPageSEOAnalyzer:
         return 0
 
 
+# =============================================================================
+# Technical SEO Checker
+# =============================================================================
+
+# Request configuration for TechnicalSEOChecker
+REQUEST_TIMEOUT = 15
+USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'
+
+# Maximum redirects to follow
+MAX_REDIRECTS = 10
+
+
+@dataclass
+class RobotsTxtResult:
+    """Analysis of robots.txt file."""
+    exists: bool = False
+    url: Optional[str] = None
+    status_code: Optional[int] = None
+    content: Optional[str] = None
+    content_length: Optional[int] = None
+    disallow_rules: List[str] = field(default_factory=list)
+    allow_rules: List[str] = field(default_factory=list)
+    sitemap_urls: List[str] = field(default_factory=list)
+    crawl_delay: Optional[float] = None
+    blocks_googlebot: bool = False
+    blocks_all_bots: bool = False
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class SitemapResult:
+    """Analysis of sitemap.xml file."""
+    exists: bool = False
+    url: Optional[str] = None
+    status_code: Optional[int] = None
+    is_valid_xml: bool = False
+    is_sitemap_index: bool = False
+    url_count: int = 0
+    sitemap_count: int = 0  # For sitemap index
+    sample_urls: List[str] = field(default_factory=list)
+    last_modified: Optional[str] = None
+    content_length: Optional[int] = None
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class RedirectInfo:
+    """Information about a single redirect."""
+    from_url: str
+    to_url: str
+    status_code: int
+    is_https_upgrade: bool = False
+    is_www_redirect: bool = False
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class RedirectChainResult:
+    """Analysis of redirect chain for a URL."""
+    original_url: str
+    final_url: str
+    chain_length: int = 0
+    redirects: List[RedirectInfo] = field(default_factory=list)
+    has_redirect_loop: bool = False
+    has_mixed_content: bool = False  # HTTP -> HTTPS -> HTTP
+    total_time_ms: Optional[int] = None
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        result = asdict(self)
+        result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects]
+        return result
+
+
+@dataclass
+class CanonicalResult:
+    """Analysis of canonical URL configuration."""
+    has_canonical: bool = False
+    canonical_url: Optional[str] = None
+    is_self_referencing: bool = False
+    points_to_different_domain: bool = False
+    is_relative: bool = False
+    is_valid_url: bool = False
+    matches_current_url: bool = False
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class IndexabilityResult:
+    """Analysis of page indexability."""
+    is_indexable: bool = True
+    has_noindex_meta: bool = False
+    has_noindex_header: bool = False
+    noindex_source: Optional[str] = None  # 'meta', 'header', 'robots.txt'
+    meta_robots_content: Optional[str] = None
+    x_robots_tag: Optional[str] = None
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+
+@dataclass
+class TechnicalSEOResult:
+    """Complete technical SEO check result."""
+    url: str
+    checked_at: str
+    robots_txt: RobotsTxtResult
+    sitemap: SitemapResult
+    redirect_chain: RedirectChainResult
+    canonical: CanonicalResult
+    indexability: IndexabilityResult
+    errors: List[str] = field(default_factory=list)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            'url': self.url,
+            'checked_at': self.checked_at,
+            'robots_txt': self.robots_txt.to_dict(),
+            'sitemap': self.sitemap.to_dict(),
+            'redirect_chain': self.redirect_chain.to_dict(),
+            'canonical': self.canonical.to_dict(),
+            'indexability': self.indexability.to_dict(),
+            'errors': self.errors,
+        }
+
+
+class TechnicalSEOChecker:
+    """
+    Checks technical SEO factors for a website.
+
+    Analyzes:
+    - robots.txt presence and configuration
+    - sitemap.xml presence and validity
+    - Canonical URL configuration
+    - Noindex tags (meta and HTTP header)
+    - Redirect chains
+
+    Usage:
+        checker = TechnicalSEOChecker()
+        result = checker.check_url('https://example.com')
+
+        # Access specific results
+        print(f"robots.txt exists: {result.robots_txt.exists}")
+        print(f"sitemap.xml exists: {result.sitemap.exists}")
+        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
+        print(f"Is indexable: {result.indexability.is_indexable}")
+    """
+
+    def __init__(self, timeout: int = REQUEST_TIMEOUT):
+        """
+        Initialize the TechnicalSEOChecker.
+
+        Args:
+            timeout: Request timeout in seconds.
+        """
+        self.timeout = timeout
+        self.session = requests.Session()
+        self.session.headers.update({'User-Agent': USER_AGENT})
+
+    def check_url(self, url: str) -> TechnicalSEOResult:
+        """
+        Perform complete technical SEO check for a URL.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            TechnicalSEOResult with all technical SEO analysis.
+        """
+        from datetime import datetime
+
+        errors = []
+
+        # Normalize URL
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
+
+        parsed = urlparse(url)
+        base_url = f"{parsed.scheme}://{parsed.netloc}"
+
+        # Check robots.txt
+        robots_result = self.check_robots_txt(base_url)
+
+        # Check sitemap.xml (use sitemap from robots.txt if available)
+        sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"]
+        sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml")
+
+        # Check redirect chain
+        redirect_result = self.check_redirect_chain(url)
+
+        # Fetch page for canonical and indexability checks
+        canonical_result = CanonicalResult()
+        indexability_result = IndexabilityResult()
+
+        try:
+            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
+            final_url = response.url
+
+            # Parse HTML for canonical and noindex
+            if response.status_code == 200:
+                canonical_result = self._check_canonical(response.text, final_url)
+                indexability_result = self._check_indexability(response)
+            else:
+                errors.append(f"HTTP {response.status_code} when fetching page")
+
+        except requests.exceptions.Timeout:
+            errors.append(f"Timeout fetching {url}")
+        except requests.exceptions.ConnectionError as e:
+            errors.append(f"Connection error: {str(e)[:100]}")
+        except requests.exceptions.RequestException as e:
+            errors.append(f"Request error: {str(e)[:100]}")
+
+        return TechnicalSEOResult(
+            url=url,
+            checked_at=datetime.now().isoformat(),
+            robots_txt=robots_result,
+            sitemap=sitemap_result,
+            redirect_chain=redirect_result,
+            canonical=canonical_result,
+            indexability=indexability_result,
+            errors=errors,
+        )
+
+    def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
+        """
+        Check robots.txt file for a domain.
+
+        Args:
+            base_url: Base URL of the site (e.g., 'https://example.com').
+
+        Returns:
+            RobotsTxtResult with robots.txt analysis.
+        """
+        result = RobotsTxtResult()
+        robots_url = f"{base_url.rstrip('/')}/robots.txt"
+        result.url = robots_url
+
+        try:
+            response = self.session.get(robots_url, timeout=self.timeout)
+            result.status_code = response.status_code
+
+            if response.status_code == 200:
+                result.exists = True
+                result.content = response.text
+                result.content_length = len(response.text)
+
+                # Parse robots.txt
+                self._parse_robots_txt(response.text, result)
+            elif response.status_code == 404:
+                result.exists = False
+            else:
+                result.errors.append(f"Unexpected status code: {response.status_code}")
+
+        except requests.exceptions.Timeout:
+            result.errors.append("Timeout fetching robots.txt")
+        except requests.exceptions.ConnectionError as e:
+            result.errors.append(f"Connection error: {str(e)[:100]}")
+        except requests.exceptions.RequestException as e:
+            result.errors.append(f"Request error: {str(e)[:100]}")
+
+        return result
+
+    def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
+        """Parse robots.txt content and populate ``result`` in place.
+
+        A "Disallow: /" is attributed to every User-agent named in its group.
+        """
+        in_googlebot_group = False
+        in_wildcard_group = False
+        group_has_rules = False  # set once the current group has a rule line
+
+        for raw_line in content.split('\n'):
+            # Drop inline comments ("Disallow: /  # note") before parsing
+            line = raw_line.split('#', 1)[0].strip()
+            # Blank, comment-only and colon-less lines carry no directive
+            if ':' not in line:
+                continue
+
+            directive, value = line.split(':', 1)
+            directive = directive.strip().lower()
+            value = value.strip()
+
+            if directive == 'user-agent':
+                # Consecutive User-agent lines share one group of rules, so a
+                # new group only starts after a rule line has been seen.
+                if group_has_rules:
+                    in_googlebot_group = in_wildcard_group = group_has_rules = False
+                agent = value.lower()
+                in_googlebot_group = in_googlebot_group or 'googlebot' in agent
+                in_wildcard_group = in_wildcard_group or agent == '*'
+            elif directive in ('disallow', 'allow'):
+                group_has_rules = True
+                if directive == 'allow':
+                    if value:
+                        result.allow_rules.append(value)
+                elif value:
+                    result.disallow_rules.append(value)
+                    if value == '/':
+                        # A site-wide Disallow applies to every agent in the group
+                        result.blocks_googlebot = result.blocks_googlebot or in_googlebot_group
+                        result.blocks_all_bots = result.blocks_all_bots or in_wildcard_group
+            elif directive == 'sitemap':
+                if value and value not in result.sitemap_urls:
+                    result.sitemap_urls.append(value)
+            elif directive == 'crawl-delay':
+                try:
+                    result.crawl_delay = float(value)
+                except ValueError:
+                    pass  # best effort: ignore a malformed Crawl-delay value
+
+        # Deduplicate while keeping first-seen order
+        result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
+        result.allow_rules = list(dict.fromkeys(result.allow_rules))
+
+    def check_sitemap(self, sitemap_url: str) -> SitemapResult:
+        """
+        Check sitemap.xml file.
+
+        Args:
+            sitemap_url: URL of the sitemap.
+
+        Returns:
+            SitemapResult with sitemap analysis.
+        """
+        result = SitemapResult()
+        result.url = sitemap_url
+
+        try:
+            response = self.session.get(sitemap_url, timeout=self.timeout)
+            result.status_code = response.status_code
+
+            if response.status_code == 200:
+                result.exists = True
+                result.content_length = len(response.content)
+
+                # Check Last-Modified header
+                last_modified = response.headers.get('Last-Modified')
+                if last_modified:
+                    result.last_modified = last_modified
+
+                # Parse XML
+                self._parse_sitemap(response.content, result)
+
+            elif response.status_code == 404:
+                result.exists = False
+            else:
+                result.errors.append(f"Unexpected status code: {response.status_code}")
+
+        except requests.exceptions.Timeout:
+            result.errors.append("Timeout fetching sitemap")
+        except requests.exceptions.ConnectionError as e:
+            result.errors.append(f"Connection error: {str(e)[:100]}")
+        except requests.exceptions.RequestException as e:
+            result.errors.append(f"Request error: {str(e)[:100]}")
+
+        return result
+
+    def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
+        """Parse sitemap XML and record its type, entry count and sample URLs.
+
+        Parse failures are recorded in ``result.errors`` instead of being raised.
+        """
+        ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
+
+        def find_loc(entry):
+            # Compare with None explicitly: a childless Element is falsy, so
+            # "find(...) or find(...)" would throw away a matched <loc>.
+            loc = entry.find('sm:loc', ns)
+            return loc if loc is not None else entry.find('loc')
+
+        try:
+            # NOTE: ElementTree is not hardened against maliciously built XML.
+            root = ET.fromstring(content)
+            result.is_valid_xml = True
+            # A sitemap index lists <sitemap> entries; a regular sitemap lists <url>
+            entries = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
+            if entries:
+                result.is_sitemap_index = True
+                result.sitemap_count = len(entries)
+                sample = entries[:5]
+            else:
+                entries = root.findall('.//sm:url', ns) or root.findall('.//url')
+                result.url_count = len(entries)
+                sample = entries[:10]
+
+            for entry in sample:
+                loc = find_loc(entry)
+                if loc is not None and loc.text:
+                    result.sample_urls.append(loc.text)
+        except ET.ParseError as e:
+            result.is_valid_xml = False
+            result.errors.append(f"Invalid XML: {str(e)[:100]}")
+        except Exception as e:
+            result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")
+
+    def check_redirect_chain(self, url: str) -> RedirectChainResult:
+        """
+        Check redirect chain for a URL.
+
+        Each hop is requested with allow_redirects=False so that every
+        intermediate response can be recorded. At most MAX_REDIRECTS
+        requests are made.
+
+        Args:
+            url: The URL to check.
+
+        Returns:
+            RedirectChainResult with redirect chain analysis. Request
+            failures are reported in its ``errors`` list, not raised.
+        """
+        result = RedirectChainResult(original_url=url, final_url=url)
+        visited_urls = set()
+        current_url = url
+        start_time = time.time()
+
+        for _ in range(MAX_REDIRECTS):
+            if current_url in visited_urls:
+                result.has_redirect_loop = True
+                result.errors.append(f"Redirect loop detected at: {current_url}")
+                break
+
+            visited_urls.add(current_url)
+
+            try:
+                response = self.session.get(
+                    current_url,
+                    timeout=self.timeout,
+                    allow_redirects=False
+                )
+            except requests.exceptions.Timeout:
+                result.errors.append(f"Timeout at: {current_url}")
+                break
+            except requests.exceptions.ConnectionError as e:
+                result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
+                break
+            except requests.exceptions.RequestException as e:
+                result.errors.append(f"Request error: {str(e)[:100]}")
+                break
+
+            if response.status_code not in (301, 302, 303, 307, 308):
+                # No more redirects
+                result.final_url = current_url
+                break
+
+            location = response.headers.get('Location')
+            if not location:
+                result.errors.append("Redirect without Location header")
+                break
+
+            # urljoin resolves relative, absolute-path and protocol-relative
+            # ("//host/path") targets; absolute URLs keep their scheme and host.
+            next_url = urljoin(current_url, location)
+
+            parsed_from = urlparse(current_url)
+            parsed_to = urlparse(next_url)
+            same_site = (
+                parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '')
+            )
+
+            result.redirects.append(RedirectInfo(
+                from_url=current_url,
+                to_url=next_url,
+                status_code=response.status_code,
+                is_https_upgrade=(
+                    same_site and
+                    parsed_from.scheme == 'http' and
+                    parsed_to.scheme == 'https'
+                ),
+                is_www_redirect=same_site and parsed_from.netloc != parsed_to.netloc,
+            ))
+
+            # Flag a chain that reaches HTTPS and later falls back to HTTP
+            if len(result.redirects) >= 2:
+                schemes = [urlparse(r.from_url).scheme for r in result.redirects]
+                schemes.append(parsed_to.scheme)
+                if 'https' in schemes and 'http' in schemes[schemes.index('https'):]:
+                    result.has_mixed_content = True
+
+            current_url = next_url
+        else:
+            # The hop budget ran out while still being redirected: report the
+            # last target instead of silently keeping the original URL.
+            result.final_url = current_url
+            result.errors.append(f"Stopped after {MAX_REDIRECTS} redirects")
+
+        result.chain_length = len(result.redirects)
+        result.total_time_ms = int((time.time() - start_time) * 1000)
+
+        return result
+
+    def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
+        """
+        Check canonical URL configuration from HTML.
+
+        Relative canonical hrefs are resolved against ``current_url`` before comparison.
+
+        Args:
+            html: HTML content of the page.
+            current_url: Current URL of the page.
+
+        Returns:
+            CanonicalResult with canonical URL analysis.
+        """
+        result = CanonicalResult()
+
+        try:
+            soup = BeautifulSoup(html, 'lxml')
+        except Exception:
+            # Fall back to the built-in parser when the lxml-based parse fails
+            try:
+                soup = BeautifulSoup(html, 'html.parser')
+            except Exception as e:
+                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
+                return result
+
+        # Find canonical link
+        canonical_tag = soup.find('link', rel='canonical')
+        if canonical_tag is None:
+            return result
+
+        result.has_canonical = True
+        canonical_url = (canonical_tag.get('href') or '').strip()
+        result.canonical_url = canonical_url
+        if not canonical_url:
+            return result
+
+        # Anything without an explicit http(s) scheme counts as relative
+        result.is_relative = not canonical_url.startswith(('http://', 'https://'))
+
+        # urljoin resolves relative paths, absolute paths and protocol-relative
+        # hrefs ("//host/path") against the page URL.
+        canonical_abs = urljoin(current_url, canonical_url) if result.is_relative else canonical_url
+
+        parsed_canonical = urlparse(canonical_abs)
+        parsed_current = urlparse(current_url)
+        same_site = (
+            parsed_canonical.netloc.replace('www.', '') ==
+            parsed_current.netloc.replace('www.', '')
+        )
+
+        # Check if valid URL
+        result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc)
+
+        # Check if self-referencing; an empty path and "/" are the same root page
+        result.is_self_referencing = (
+            same_site and
+            (parsed_canonical.path or '/') == (parsed_current.path or '/')
+        )
+
+        # Check if points to different domain
+        result.points_to_different_domain = not same_site
+
+        # Check if matches current URL exactly
+        result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/'))
+
+        return result
+
+    def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
+        """
+        Check if page is indexable based on meta tags and HTTP headers.
+
+        Args:
+            response: Response object from fetching the page.
+
+        Returns:
+            IndexabilityResult with indexability analysis.
+        """
+        def blocks_indexing(directives: str) -> bool:
+            # "none" is shorthand for "noindex, nofollow"; a valued directive
+            # such as "max-image-preview:none" must not count.
+            lowered = directives.lower()
+            return 'noindex' in lowered or any(
+                value.strip() == 'none' and not name.strip().startswith('max-')
+                for name, _, value in (p.rpartition(':') for p in lowered.split(','))
+            )
+
+        result = IndexabilityResult()
+
+        # Check X-Robots-Tag HTTP header
+        x_robots = response.headers.get('X-Robots-Tag', '')
+        if x_robots:
+            result.x_robots_tag = x_robots
+            if blocks_indexing(x_robots):
+                result.has_noindex_header = True
+                result.is_indexable = False
+                result.noindex_source = 'header'
+
+        # Check meta robots tag in HTML
+        try:
+            soup = BeautifulSoup(response.text, 'lxml')
+        except Exception:
+            try:
+                soup = BeautifulSoup(response.text, 'html.parser')
+            except Exception as e:
+                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
+                return result
+
+        # Generic "robots" meta first, then the googlebot-specific one
+        for meta_name in ('robots', 'googlebot'):
+            tag = soup.find('meta', attrs={'name': re.compile(rf'^{meta_name}$', re.I)})
+            if tag is None:
+                continue
+            content = tag.get('content', '')
+            if meta_name == 'robots':
+                result.meta_robots_content = content
+            if blocks_indexing(content):
+                result.has_noindex_meta = True
+                result.is_indexable = False
+                result.noindex_source = result.noindex_source or 'meta'
+
+        return result
+
+
 # Convenience function
 def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
     """
@@ -767,20 +1407,139 @@ def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
     return result.to_dict()
 
 
+def check_technical_seo(url: str) -> Dict[str, Any]:
+    """
+    Convenience function for technical SEO check.
+
+    Args:
+        url: The URL to check.
+
+    Returns:
+        Dict with technical SEO analysis results.
+    """
+    checker = TechnicalSEOChecker()
+    result = checker.check_url(url)
+    return result.to_dict()
+
+
 if __name__ == '__main__':
     import sys
-    import requests
+    import argparse
 
-    if len(sys.argv) < 2:
-        print("Usage: python seo_analyzer.py <url>")
-        print("Example: python seo_analyzer.py https://pixlab.pl")
-        sys.exit(1)
+    parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
+    parser.add_argument('url', help='URL to analyze')
+    parser.add_argument('--technical', '-t', action='store_true',
+                        help='Run technical SEO checks (robots.txt, sitemap, redirects)')
+    parser.add_argument('--all', '-a', action='store_true',
+                        help='Run both on-page and technical SEO analysis')
+    parser.add_argument('--json', '-j', action='store_true',
+                        help='Output results as JSON')
 
-    test_url = sys.argv[1]
+    args = parser.parse_args()
+    test_url = args.url
 
     print(f"Analyzing: {test_url}")
     print("-" * 60)
 
+    # Run technical SEO checks if requested
+    if args.technical or args.all:
+        print("\n" + "=" * 60)
+        print("TECHNICAL SEO ANALYSIS")
+        print("=" * 60)
+
+        checker = TechnicalSEOChecker()
+        tech_result = checker.check_url(test_url)
+
+        if args.json:
+            print(json.dumps(tech_result.to_dict(), indent=2, default=str))
+        else:
+            print("\n=== ROBOTS.TXT ===")
+            print(f"Exists: {tech_result.robots_txt.exists}")
+            print(f"URL: {tech_result.robots_txt.url}")
+            print(f"Status code: {tech_result.robots_txt.status_code}")
+            if tech_result.robots_txt.exists:
+                print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
+                if tech_result.robots_txt.disallow_rules[:5]:
+                    print(f"  Sample: {tech_result.robots_txt.disallow_rules[:5]}")
+                print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
+                print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
+                print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
+                if tech_result.robots_txt.crawl_delay:
+                    print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
+            if tech_result.robots_txt.errors:
+                print(f"Errors: {tech_result.robots_txt.errors}")
+
+            print("\n=== SITEMAP ===")
+            print(f"Exists: {tech_result.sitemap.exists}")
+            print(f"URL: {tech_result.sitemap.url}")
+            print(f"Status code: {tech_result.sitemap.status_code}")
+            if tech_result.sitemap.exists:
+                print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
+                print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
+                if tech_result.sitemap.is_sitemap_index:
+                    print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
+                else:
+                    print(f"URL count: {tech_result.sitemap.url_count}")
+                if tech_result.sitemap.sample_urls:
+                    print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
+            if tech_result.sitemap.errors:
+                print(f"Errors: {tech_result.sitemap.errors}")
+
+            print("\n=== REDIRECT CHAIN ===")
+            print(f"Original URL: {tech_result.redirect_chain.original_url}")
+            print(f"Final URL: {tech_result.redirect_chain.final_url}")
+            print(f"Chain length: {tech_result.redirect_chain.chain_length}")
+            if tech_result.redirect_chain.redirects:
+                for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
+                    print(f"  [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
+                    if r.is_https_upgrade:
+                        print(f"       (HTTPS upgrade)")
+                    if r.is_www_redirect:
+                        print(f"       (www redirect)")
+            print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
+            print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
+            print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
+            if tech_result.redirect_chain.errors:
+                print(f"Errors: {tech_result.redirect_chain.errors}")
+
+            print("\n=== CANONICAL ===")
+            print(f"Has canonical: {tech_result.canonical.has_canonical}")
+            if tech_result.canonical.has_canonical:
+                print(f"Canonical URL: {tech_result.canonical.canonical_url}")
+                print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
+                print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
+                print(f"Is relative: {tech_result.canonical.is_relative}")
+                print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
+            if tech_result.canonical.errors:
+                print(f"Errors: {tech_result.canonical.errors}")
+
+            print("\n=== INDEXABILITY ===")
+            print(f"Is indexable: {tech_result.indexability.is_indexable}")
+            print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
+            print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
+            if tech_result.indexability.noindex_source:
+                print(f"Noindex source: {tech_result.indexability.noindex_source}")
+            if tech_result.indexability.meta_robots_content:
+                print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
+            if tech_result.indexability.x_robots_tag:
+                print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
+            if tech_result.indexability.errors:
+                print(f"Errors: {tech_result.indexability.errors}")
+
+            if tech_result.errors:
+                print(f"\n=== GENERAL ERRORS ===")
+                for error in tech_result.errors:
+                    print(f"  - {error}")
+
+        # If only technical was requested, exit
+        if not args.all:
+            sys.exit(0)
+
+    # Run on-page analysis (default behavior)
+    print("\n" + "=" * 60)
+    print("ON-PAGE SEO ANALYSIS")
+    print("=" * 60)
+
     # Fetch the page
     try:
         headers = {
@@ -797,65 +1556,68 @@ if __name__ == '__main__':
     analyzer = OnPageSEOAnalyzer()
     result = analyzer.analyze_html(html, test_url)
 
-    # Print results
-    print("\n=== META TAGS ===")
-    print(f"Title: {result.meta_tags.title}")
-    print(f"Title length: {result.meta_tags.title_length}")
-    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
-    print(f"Description length: {result.meta_tags.description_length}")
-    print(f"Canonical: {result.meta_tags.canonical_url}")
-    print(f"Robots: {result.meta_tags.robots}")
-    print(f"Viewport: {result.meta_tags.viewport}")
+    if args.json:
+        print(json.dumps(result.to_dict(), indent=2, default=str))
+    else:
+        # Print results
+        print("\n=== META TAGS ===")
+        print(f"Title: {result.meta_tags.title}")
+        print(f"Title length: {result.meta_tags.title_length}")
+        print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
+        print(f"Description length: {result.meta_tags.description_length}")
+        print(f"Canonical: {result.meta_tags.canonical_url}")
+        print(f"Robots: {result.meta_tags.robots}")
+        print(f"Viewport: {result.meta_tags.viewport}")
 
-    print("\n=== OPEN GRAPH ===")
-    print(f"OG Title: {result.open_graph.og_title}")
-    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
-    print(f"OG Image: {result.open_graph.og_image}")
-    print(f"OG Type: {result.open_graph.og_type}")
+        print("\n=== OPEN GRAPH ===")
+        print(f"OG Title: {result.open_graph.og_title}")
+        print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
+        print(f"OG Image: {result.open_graph.og_image}")
+        print(f"OG Type: {result.open_graph.og_type}")
 
-    print("\n=== TWITTER CARD ===")
-    print(f"Card Type: {result.twitter_card.card_type}")
-    print(f"Title: {result.twitter_card.title}")
+        print("\n=== TWITTER CARD ===")
+        print(f"Card Type: {result.twitter_card.card_type}")
+        print(f"Title: {result.twitter_card.title}")
 
-    print("\n=== HEADINGS ===")
-    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
-    print(f"H2: {result.headings.h2_count}")
-    print(f"H3: {result.headings.h3_count}")
-    print(f"H4: {result.headings.h4_count}")
-    print(f"H5: {result.headings.h5_count}")
-    print(f"H6: {result.headings.h6_count}")
-    print(f"Has single H1: {result.headings.has_single_h1}")
-    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
-    if result.headings.hierarchy_issues:
-        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
+        print("\n=== HEADINGS ===")
+        print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
+        print(f"H2: {result.headings.h2_count}")
+        print(f"H3: {result.headings.h3_count}")
+        print(f"H4: {result.headings.h4_count}")
+        print(f"H5: {result.headings.h5_count}")
+        print(f"H6: {result.headings.h6_count}")
+        print(f"Has single H1: {result.headings.has_single_h1}")
+        print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
+        if result.headings.hierarchy_issues:
+            print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
 
-    print("\n=== IMAGES ===")
-    print(f"Total images: {result.images.total_images}")
-    print(f"With alt: {result.images.images_with_alt}")
-    print(f"Without alt: {result.images.images_without_alt}")
-    print(f"With empty alt: {result.images.images_with_empty_alt}")
-    if result.images.alt_text_quality_issues:
-        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
+        print("\n=== IMAGES ===")
+        print(f"Total images: {result.images.total_images}")
+        print(f"With alt: {result.images.images_with_alt}")
+        print(f"Without alt: {result.images.images_without_alt}")
+        print(f"With empty alt: {result.images.images_with_empty_alt}")
+        if result.images.alt_text_quality_issues:
+            print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
 
-    print("\n=== LINKS ===")
-    print(f"Total links: {result.links.total_links}")
-    print(f"Internal: {result.links.internal_links}")
-    print(f"External: {result.links.external_links}")
-    print(f"Nofollow: {result.links.nofollow_links}")
-    print(f"Broken anchor links: {result.links.broken_anchor_links}")
-    print(f"External domains: {result.links.unique_external_domains[:5]}")
+        print("\n=== LINKS ===")
+        print(f"Total links: {result.links.total_links}")
+        print(f"Internal: {result.links.internal_links}")
+        print(f"External: {result.links.external_links}")
+        print(f"Nofollow: {result.links.nofollow_links}")
+        print(f"Broken anchor links: {result.links.broken_anchor_links}")
+        print(f"External domains: {result.links.unique_external_domains[:5]}")
 
-    print("\n=== STRUCTURED DATA ===")
-    print(f"Has structured data: {result.structured_data.has_structured_data}")
-    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
-    print(f"Microdata count: {result.structured_data.microdata_count}")
-    print(f"RDFa count: {result.structured_data.rdfa_count}")
-    print(f"Schema types: {result.structured_data.all_types}")
+        print("\n=== STRUCTURED DATA ===")
+        print(f"Has structured data: {result.structured_data.has_structured_data}")
+        print(f"JSON-LD count: {result.structured_data.json_ld_count}")
+        print(f"Microdata count: {result.structured_data.microdata_count}")
+        print(f"RDFa count: {result.structured_data.rdfa_count}")
+        print(f"Schema types: {result.structured_data.all_types}")
 
-    print("\n=== OTHER ===")
-    print(f"Word count: {result.word_count}")
-    print(f"Has DOCTYPE: {result.has_doctype}")
-    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
+        print("\n=== OTHER ===")
+        print(f"Word count: {result.word_count}")
+        print(f"Has DOCTYPE: {result.has_doctype}")
+        print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
 
-    if result.errors:
-        print(f"\nErrors: {result.errors}")
+        if result.errors:
+            print(f"\nErrors: {result.errors}")