From 81fc27dfa9e02385d9b2bc26c4ca13af585609a5 Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Thu, 8 Jan 2026 02:12:47 +0100 Subject: [PATCH] auto-claude: 3.2 - Add TechnicalSEOChecker class to scripts/seo_analyzer.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds TechnicalSEOChecker class that performs technical SEO audits: - robots.txt: checks existence, parses directives (Disallow, Allow, Sitemap, Crawl-delay), and detects whether it blocks Googlebot or all bots - sitemap.xml: checks existence, validates XML, counts URLs, detects sitemap index - Canonical URLs: detects canonical tag, checks if self-referencing or cross-domain - Noindex tags: checks meta robots and X-Robots-Tag HTTP header - Redirect chains: follows up to 10 redirects, detects loops, HTTPS upgrades, www redirects, and mixed content issues (an HTTPS hop followed by a downgrade back to HTTP) Includes: - 6 dataclasses for structured results (RobotsTxtResult, SitemapResult, etc.) - TechnicalSEOResult container dataclass for complete analysis - check_technical_seo() convenience function - CLI support: --technical/-t flag for technical-only analysis - --all/-a flag for combined on-page and technical analysis - --json/-j flag for JSON output 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- scripts/seo_analyzer.py | 886 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 824 insertions(+), 62 deletions(-) diff --git a/scripts/seo_analyzer.py b/scripts/seo_analyzer.py index 22ea9b1..6de4c62 100644 --- a/scripts/seo_analyzer.py +++ b/scripts/seo_analyzer.py @@ -11,12 +11,24 @@ Analyzes HTML content for SEO factors including: - Structured data detection (JSON-LD, Microdata, RDFa) - Open Graph and Twitter Card metadata -Usage: - from seo_analyzer import OnPageSEOAnalyzer +Also includes TechnicalSEOChecker for: +- robots.txt analysis +- sitemap.xml validation +- Canonical URL verification +- Noindex tag detection +- Redirect chain analysis +Usage: + from seo_analyzer import 
OnPageSEOAnalyzer, TechnicalSEOChecker + + # On-page analysis analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html_content, base_url='https://example.com') + # Technical SEO checks + checker = TechnicalSEOChecker() + tech_result = checker.check_url('https://example.com') + Author: Claude Code Date: 2026-01-08 """ @@ -24,10 +36,13 @@ Date: 2026-01-08 import json import re import logging +import time +import xml.etree.ElementTree as ET from typing import Optional, Dict, List, Any, Tuple from dataclasses import dataclass, field, asdict from urllib.parse import urlparse, urljoin +import requests from bs4 import BeautifulSoup, Comment # Configure logging @@ -750,6 +765,631 @@ class OnPageSEOAnalyzer: return 0 +# ============================================================================= +# Technical SEO Checker +# ============================================================================= + +# Request configuration for TechnicalSEOChecker +REQUEST_TIMEOUT = 15 +USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0' + +# Maximum redirects to follow +MAX_REDIRECTS = 10 + + +@dataclass +class RobotsTxtResult: + """Analysis of robots.txt file.""" + exists: bool = False + url: Optional[str] = None + status_code: Optional[int] = None + content: Optional[str] = None + content_length: Optional[int] = None + disallow_rules: List[str] = field(default_factory=list) + allow_rules: List[str] = field(default_factory=list) + sitemap_urls: List[str] = field(default_factory=list) + crawl_delay: Optional[float] = None + blocks_googlebot: bool = False + blocks_all_bots: bool = False + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class SitemapResult: + """Analysis of sitemap.xml file.""" + exists: bool = False + url: Optional[str] = None + status_code: Optional[int] = None + is_valid_xml: 
bool = False + is_sitemap_index: bool = False + url_count: int = 0 + sitemap_count: int = 0 # For sitemap index + sample_urls: List[str] = field(default_factory=list) + last_modified: Optional[str] = None + content_length: Optional[int] = None + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class RedirectInfo: + """Information about a single redirect.""" + from_url: str + to_url: str + status_code: int + is_https_upgrade: bool = False + is_www_redirect: bool = False + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class RedirectChainResult: + """Analysis of redirect chain for a URL.""" + original_url: str + final_url: str + chain_length: int = 0 + redirects: List[RedirectInfo] = field(default_factory=list) + has_redirect_loop: bool = False + has_mixed_content: bool = False # HTTP -> HTTPS -> HTTP + total_time_ms: Optional[int] = None + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + result = asdict(self) + result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects] + return result + + +@dataclass +class CanonicalResult: + """Analysis of canonical URL configuration.""" + has_canonical: bool = False + canonical_url: Optional[str] = None + is_self_referencing: bool = False + points_to_different_domain: bool = False + is_relative: bool = False + is_valid_url: bool = False + matches_current_url: bool = False + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class IndexabilityResult: + """Analysis of page indexability.""" + is_indexable: bool = True + has_noindex_meta: bool = False + has_noindex_header: bool = False + noindex_source: Optional[str] = None # 'meta', 'header', 'robots.txt' + meta_robots_content: Optional[str] = None + x_robots_tag: Optional[str] = None + errors: List[str] = 
field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return asdict(self) + + +@dataclass +class TechnicalSEOResult: + """Complete technical SEO check result.""" + url: str + checked_at: str + robots_txt: RobotsTxtResult + sitemap: SitemapResult + redirect_chain: RedirectChainResult + canonical: CanonicalResult + indexability: IndexabilityResult + errors: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return { + 'url': self.url, + 'checked_at': self.checked_at, + 'robots_txt': self.robots_txt.to_dict(), + 'sitemap': self.sitemap.to_dict(), + 'redirect_chain': self.redirect_chain.to_dict(), + 'canonical': self.canonical.to_dict(), + 'indexability': self.indexability.to_dict(), + 'errors': self.errors, + } + + +class TechnicalSEOChecker: + """ + Checks technical SEO factors for a website. + + Analyzes: + - robots.txt presence and configuration + - sitemap.xml presence and validity + - Canonical URL configuration + - Noindex tags (meta and HTTP header) + - Redirect chains + + Usage: + checker = TechnicalSEOChecker() + result = checker.check_url('https://example.com') + + # Access specific results + print(f"robots.txt exists: {result.robots_txt.exists}") + print(f"sitemap.xml exists: {result.sitemap.exists}") + print(f"Redirect chain length: {result.redirect_chain.chain_length}") + print(f"Is indexable: {result.indexability.is_indexable}") + """ + + def __init__(self, timeout: int = REQUEST_TIMEOUT): + """ + Initialize the TechnicalSEOChecker. + + Args: + timeout: Request timeout in seconds. + """ + self.timeout = timeout + self.session = requests.Session() + self.session.headers.update({'User-Agent': USER_AGENT}) + + def check_url(self, url: str) -> TechnicalSEOResult: + """ + Perform complete technical SEO check for a URL. + + Args: + url: The URL to check. + + Returns: + TechnicalSEOResult with all technical SEO analysis. 
+ """ + from datetime import datetime + + errors = [] + + # Normalize URL + if not url.startswith(('http://', 'https://')): + url = 'https://' + url + + parsed = urlparse(url) + base_url = f"{parsed.scheme}://{parsed.netloc}" + + # Check robots.txt + robots_result = self.check_robots_txt(base_url) + + # Check sitemap.xml (use sitemap from robots.txt if available) + sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"] + sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml") + + # Check redirect chain + redirect_result = self.check_redirect_chain(url) + + # Fetch page for canonical and indexability checks + canonical_result = CanonicalResult() + indexability_result = IndexabilityResult() + + try: + response = self.session.get(url, timeout=self.timeout, allow_redirects=True) + final_url = response.url + + # Parse HTML for canonical and noindex + if response.status_code == 200: + canonical_result = self._check_canonical(response.text, final_url) + indexability_result = self._check_indexability(response) + else: + errors.append(f"HTTP {response.status_code} when fetching page") + + except requests.exceptions.Timeout: + errors.append(f"Timeout fetching {url}") + except requests.exceptions.ConnectionError as e: + errors.append(f"Connection error: {str(e)[:100]}") + except requests.exceptions.RequestException as e: + errors.append(f"Request error: {str(e)[:100]}") + + return TechnicalSEOResult( + url=url, + checked_at=datetime.now().isoformat(), + robots_txt=robots_result, + sitemap=sitemap_result, + redirect_chain=redirect_result, + canonical=canonical_result, + indexability=indexability_result, + errors=errors, + ) + + def check_robots_txt(self, base_url: str) -> RobotsTxtResult: + """ + Check robots.txt file for a domain. + + Args: + base_url: Base URL of the site (e.g., 'https://example.com'). + + Returns: + RobotsTxtResult with robots.txt analysis. 
+ """ + result = RobotsTxtResult() + robots_url = f"{base_url.rstrip('/')}/robots.txt" + result.url = robots_url + + try: + response = self.session.get(robots_url, timeout=self.timeout) + result.status_code = response.status_code + + if response.status_code == 200: + result.exists = True + result.content = response.text + result.content_length = len(response.text) + + # Parse robots.txt + self._parse_robots_txt(response.text, result) + elif response.status_code == 404: + result.exists = False + else: + result.errors.append(f"Unexpected status code: {response.status_code}") + + except requests.exceptions.Timeout: + result.errors.append("Timeout fetching robots.txt") + except requests.exceptions.ConnectionError as e: + result.errors.append(f"Connection error: {str(e)[:100]}") + except requests.exceptions.RequestException as e: + result.errors.append(f"Request error: {str(e)[:100]}") + + return result + + def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None: + """Parse robots.txt content and populate result.""" + current_user_agent = None + is_googlebot_section = False + is_all_section = False + + for line in content.split('\n'): + line = line.strip() + + # Skip empty lines and comments + if not line or line.startswith('#'): + continue + + # Split on first colon + if ':' not in line: + continue + + directive, value = line.split(':', 1) + directive = directive.strip().lower() + value = value.strip() + + if directive == 'user-agent': + current_user_agent = value.lower() + is_googlebot_section = 'googlebot' in current_user_agent + is_all_section = current_user_agent == '*' + + elif directive == 'disallow' and value: + result.disallow_rules.append(value) + # Check if blocking important paths + if value == '/' and (is_googlebot_section or is_all_section): + if is_googlebot_section: + result.blocks_googlebot = True + if is_all_section: + result.blocks_all_bots = True + + elif directive == 'allow' and value: + result.allow_rules.append(value) + + elif 
directive == 'sitemap': + if value and value not in result.sitemap_urls: + result.sitemap_urls.append(value) + + elif directive == 'crawl-delay': + try: + result.crawl_delay = float(value) + except ValueError: + pass + + # Deduplicate + result.disallow_rules = list(dict.fromkeys(result.disallow_rules)) + result.allow_rules = list(dict.fromkeys(result.allow_rules)) + + def check_sitemap(self, sitemap_url: str) -> SitemapResult: + """ + Check sitemap.xml file. + + Args: + sitemap_url: URL of the sitemap. + + Returns: + SitemapResult with sitemap analysis. + """ + result = SitemapResult() + result.url = sitemap_url + + try: + response = self.session.get(sitemap_url, timeout=self.timeout) + result.status_code = response.status_code + + if response.status_code == 200: + result.exists = True + result.content_length = len(response.content) + + # Check Last-Modified header + last_modified = response.headers.get('Last-Modified') + if last_modified: + result.last_modified = last_modified + + # Parse XML + self._parse_sitemap(response.content, result) + + elif response.status_code == 404: + result.exists = False + else: + result.errors.append(f"Unexpected status code: {response.status_code}") + + except requests.exceptions.Timeout: + result.errors.append("Timeout fetching sitemap") + except requests.exceptions.ConnectionError as e: + result.errors.append(f"Connection error: {str(e)[:100]}") + except requests.exceptions.RequestException as e: + result.errors.append(f"Request error: {str(e)[:100]}") + + return result + + def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None: + """Parse sitemap XML content and populate result.""" + try: + # Try to parse as XML + root = ET.fromstring(content) + result.is_valid_xml = True + + # Check namespace (handle both with and without namespace) + ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'} + + # Check if it's a sitemap index + sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap') + if 
sitemap_tags: + result.is_sitemap_index = True + result.sitemap_count = len(sitemap_tags) + + # Get sample sitemap URLs + for sitemap_tag in sitemap_tags[:5]: + loc = sitemap_tag.find('sm:loc', ns) or sitemap_tag.find('loc') + if loc is not None and loc.text: + result.sample_urls.append(loc.text) + else: + # Regular sitemap + url_tags = root.findall('.//sm:url', ns) or root.findall('.//url') + result.url_count = len(url_tags) + + # Get sample URLs + for url_tag in url_tags[:10]: + loc = url_tag.find('sm:loc', ns) or url_tag.find('loc') + if loc is not None and loc.text: + result.sample_urls.append(loc.text) + + except ET.ParseError as e: + result.is_valid_xml = False + result.errors.append(f"Invalid XML: {str(e)[:100]}") + except Exception as e: + result.errors.append(f"Error parsing sitemap: {str(e)[:100]}") + + def check_redirect_chain(self, url: str) -> RedirectChainResult: + """ + Check redirect chain for a URL. + + Args: + url: The URL to check. + + Returns: + RedirectChainResult with redirect chain analysis. 
+ """ + result = RedirectChainResult(original_url=url, final_url=url) + visited_urls = set() + current_url = url + start_time = time.time() + + for i in range(MAX_REDIRECTS): + if current_url in visited_urls: + result.has_redirect_loop = True + result.errors.append(f"Redirect loop detected at: {current_url}") + break + + visited_urls.add(current_url) + + try: + response = self.session.get( + current_url, + timeout=self.timeout, + allow_redirects=False + ) + + # Check for redirect + if response.status_code in (301, 302, 303, 307, 308): + next_url = response.headers.get('Location') + if not next_url: + result.errors.append("Redirect without Location header") + break + + # Handle relative redirects + if not next_url.startswith(('http://', 'https://')): + parsed = urlparse(current_url) + if next_url.startswith('/'): + next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}" + else: + next_url = urljoin(current_url, next_url) + + # Create redirect info + parsed_from = urlparse(current_url) + parsed_to = urlparse(next_url) + + redirect_info = RedirectInfo( + from_url=current_url, + to_url=next_url, + status_code=response.status_code, + is_https_upgrade=( + parsed_from.scheme == 'http' and + parsed_to.scheme == 'https' and + parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') + ), + is_www_redirect=( + parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') and + parsed_from.netloc != parsed_to.netloc + ) + ) + result.redirects.append(redirect_info) + + # Check for mixed content + if len(result.redirects) >= 2: + schemes = [urlparse(r.from_url).scheme for r in result.redirects] + schemes.append(parsed_to.scheme) + if 'http' in schemes and 'https' in schemes: + if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]: + result.has_mixed_content = True + + current_url = next_url + + else: + # No more redirects + result.final_url = current_url + break + + except requests.exceptions.Timeout: 
+ result.errors.append(f"Timeout at: {current_url}") + break + except requests.exceptions.ConnectionError as e: + result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}") + break + except requests.exceptions.RequestException as e: + result.errors.append(f"Request error: {str(e)[:100]}") + break + + result.chain_length = len(result.redirects) + result.total_time_ms = int((time.time() - start_time) * 1000) + + return result + + def _check_canonical(self, html: str, current_url: str) -> CanonicalResult: + """ + Check canonical URL configuration from HTML. + + Args: + html: HTML content of the page. + current_url: Current URL of the page. + + Returns: + CanonicalResult with canonical URL analysis. + """ + result = CanonicalResult() + + try: + soup = BeautifulSoup(html, 'lxml') + except Exception: + try: + soup = BeautifulSoup(html, 'html.parser') + except Exception as e: + result.errors.append(f"Failed to parse HTML: {str(e)[:100]}") + return result + + # Find canonical link + canonical_tag = soup.find('link', rel='canonical') + + if canonical_tag: + result.has_canonical = True + canonical_url = canonical_tag.get('href', '') + result.canonical_url = canonical_url + + if canonical_url: + # Check if relative + result.is_relative = not canonical_url.startswith(('http://', 'https://')) + + # Parse canonical URL + if result.is_relative: + # Make it absolute for comparison + parsed_current = urlparse(current_url) + if canonical_url.startswith('/'): + canonical_abs = f"{parsed_current.scheme}://{parsed_current.netloc}{canonical_url}" + else: + canonical_abs = urljoin(current_url, canonical_url) + else: + canonical_abs = canonical_url + + parsed_canonical = urlparse(canonical_abs) + parsed_current = urlparse(current_url) + + # Check if valid URL + result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc) + + # Check if self-referencing + result.is_self_referencing = ( + parsed_canonical.netloc.replace('www.', '') == 
parsed_current.netloc.replace('www.', '') and + parsed_canonical.path == parsed_current.path + ) + + # Check if points to different domain + result.points_to_different_domain = ( + parsed_canonical.netloc.replace('www.', '') != parsed_current.netloc.replace('www.', '') + ) + + # Check if matches current URL exactly + result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/')) + + return result + + def _check_indexability(self, response: requests.Response) -> IndexabilityResult: + """ + Check if page is indexable based on meta tags and HTTP headers. + + Args: + response: Response object from fetching the page. + + Returns: + IndexabilityResult with indexability analysis. + """ + result = IndexabilityResult() + + # Check X-Robots-Tag HTTP header + x_robots = response.headers.get('X-Robots-Tag', '') + if x_robots: + result.x_robots_tag = x_robots + if 'noindex' in x_robots.lower(): + result.has_noindex_header = True + result.is_indexable = False + result.noindex_source = 'header' + + # Check meta robots tag in HTML + try: + soup = BeautifulSoup(response.text, 'lxml') + except Exception: + try: + soup = BeautifulSoup(response.text, 'html.parser') + except Exception as e: + result.errors.append(f"Failed to parse HTML: {str(e)[:100]}") + return result + + # Find meta robots + meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)}) + if meta_robots: + content = meta_robots.get('content', '') + result.meta_robots_content = content + + if 'noindex' in content.lower(): + result.has_noindex_meta = True + result.is_indexable = False + if not result.noindex_source: + result.noindex_source = 'meta' + + # Also check googlebot-specific meta + meta_googlebot = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)}) + if meta_googlebot: + content = meta_googlebot.get('content', '') + if 'noindex' in content.lower(): + result.has_noindex_meta = True + result.is_indexable = False + if not result.noindex_source: + 
result.noindex_source = 'meta' + + return result + + # Convenience function def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]: """ @@ -767,20 +1407,139 @@ def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]: return result.to_dict() +def check_technical_seo(url: str) -> Dict[str, Any]: + """ + Convenience function for technical SEO check. + + Args: + url: The URL to check. + + Returns: + Dict with technical SEO analysis results. + """ + checker = TechnicalSEOChecker() + result = checker.check_url(url) + return result.to_dict() + + if __name__ == '__main__': import sys - import requests + import argparse - if len(sys.argv) < 2: - print("Usage: python seo_analyzer.py ") - print("Example: python seo_analyzer.py https://pixlab.pl") - sys.exit(1) + parser = argparse.ArgumentParser(description='SEO Analyzer for websites') + parser.add_argument('url', help='URL to analyze') + parser.add_argument('--technical', '-t', action='store_true', + help='Run technical SEO checks (robots.txt, sitemap, redirects)') + parser.add_argument('--all', '-a', action='store_true', + help='Run both on-page and technical SEO analysis') + parser.add_argument('--json', '-j', action='store_true', + help='Output results as JSON') - test_url = sys.argv[1] + args = parser.parse_args() + test_url = args.url print(f"Analyzing: {test_url}") print("-" * 60) + # Run technical SEO checks if requested + if args.technical or args.all: + print("\n" + "=" * 60) + print("TECHNICAL SEO ANALYSIS") + print("=" * 60) + + checker = TechnicalSEOChecker() + tech_result = checker.check_url(test_url) + + if args.json: + print(json.dumps(tech_result.to_dict(), indent=2, default=str)) + else: + print("\n=== ROBOTS.TXT ===") + print(f"Exists: {tech_result.robots_txt.exists}") + print(f"URL: {tech_result.robots_txt.url}") + print(f"Status code: {tech_result.robots_txt.status_code}") + if tech_result.robots_txt.exists: + print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}") + if 
tech_result.robots_txt.disallow_rules[:5]: + print(f" Sample: {tech_result.robots_txt.disallow_rules[:5]}") + print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}") + print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}") + print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}") + if tech_result.robots_txt.crawl_delay: + print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}") + if tech_result.robots_txt.errors: + print(f"Errors: {tech_result.robots_txt.errors}") + + print("\n=== SITEMAP ===") + print(f"Exists: {tech_result.sitemap.exists}") + print(f"URL: {tech_result.sitemap.url}") + print(f"Status code: {tech_result.sitemap.status_code}") + if tech_result.sitemap.exists: + print(f"Valid XML: {tech_result.sitemap.is_valid_xml}") + print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}") + if tech_result.sitemap.is_sitemap_index: + print(f"Sitemap count: {tech_result.sitemap.sitemap_count}") + else: + print(f"URL count: {tech_result.sitemap.url_count}") + if tech_result.sitemap.sample_urls: + print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}") + if tech_result.sitemap.errors: + print(f"Errors: {tech_result.sitemap.errors}") + + print("\n=== REDIRECT CHAIN ===") + print(f"Original URL: {tech_result.redirect_chain.original_url}") + print(f"Final URL: {tech_result.redirect_chain.final_url}") + print(f"Chain length: {tech_result.redirect_chain.chain_length}") + if tech_result.redirect_chain.redirects: + for i, r in enumerate(tech_result.redirect_chain.redirects[:5]): + print(f" [{i+1}] {r.status_code}: {r.from_url[:50]}... 
-> {r.to_url[:50]}...") + if r.is_https_upgrade: + print(f" (HTTPS upgrade)") + if r.is_www_redirect: + print(f" (www redirect)") + print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}") + print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}") + print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms") + if tech_result.redirect_chain.errors: + print(f"Errors: {tech_result.redirect_chain.errors}") + + print("\n=== CANONICAL ===") + print(f"Has canonical: {tech_result.canonical.has_canonical}") + if tech_result.canonical.has_canonical: + print(f"Canonical URL: {tech_result.canonical.canonical_url}") + print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}") + print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}") + print(f"Is relative: {tech_result.canonical.is_relative}") + print(f"Is valid URL: {tech_result.canonical.is_valid_url}") + if tech_result.canonical.errors: + print(f"Errors: {tech_result.canonical.errors}") + + print("\n=== INDEXABILITY ===") + print(f"Is indexable: {tech_result.indexability.is_indexable}") + print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}") + print(f"Has noindex header: {tech_result.indexability.has_noindex_header}") + if tech_result.indexability.noindex_source: + print(f"Noindex source: {tech_result.indexability.noindex_source}") + if tech_result.indexability.meta_robots_content: + print(f"Meta robots: {tech_result.indexability.meta_robots_content}") + if tech_result.indexability.x_robots_tag: + print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}") + if tech_result.indexability.errors: + print(f"Errors: {tech_result.indexability.errors}") + + if tech_result.errors: + print(f"\n=== GENERAL ERRORS ===") + for error in tech_result.errors: + print(f" - {error}") + + # If only technical was requested, exit + if not args.all: + sys.exit(0) + + # Run on-page analysis (default behavior) + print("\n" + "=" * 60) 
+ print("ON-PAGE SEO ANALYSIS") + print("=" * 60) + # Fetch the page try: headers = { @@ -797,65 +1556,68 @@ if __name__ == '__main__': analyzer = OnPageSEOAnalyzer() result = analyzer.analyze_html(html, test_url) - # Print results - print("\n=== META TAGS ===") - print(f"Title: {result.meta_tags.title}") - print(f"Title length: {result.meta_tags.title_length}") - print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...") - print(f"Description length: {result.meta_tags.description_length}") - print(f"Canonical: {result.meta_tags.canonical_url}") - print(f"Robots: {result.meta_tags.robots}") - print(f"Viewport: {result.meta_tags.viewport}") + if args.json: + print(json.dumps(result.to_dict(), indent=2, default=str)) + else: + # Print results + print("\n=== META TAGS ===") + print(f"Title: {result.meta_tags.title}") + print(f"Title length: {result.meta_tags.title_length}") + print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...") + print(f"Description length: {result.meta_tags.description_length}") + print(f"Canonical: {result.meta_tags.canonical_url}") + print(f"Robots: {result.meta_tags.robots}") + print(f"Viewport: {result.meta_tags.viewport}") - print("\n=== OPEN GRAPH ===") - print(f"OG Title: {result.open_graph.og_title}") - print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...") - print(f"OG Image: {result.open_graph.og_image}") - print(f"OG Type: {result.open_graph.og_type}") + print("\n=== OPEN GRAPH ===") + print(f"OG Title: {result.open_graph.og_title}") + print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...") + print(f"OG Image: {result.open_graph.og_image}") + print(f"OG Type: {result.open_graph.og_type}") - print("\n=== TWITTER CARD ===") - print(f"Card Type: {result.twitter_card.card_type}") - print(f"Title: 
{result.twitter_card.title}") + print("\n=== TWITTER CARD ===") + print(f"Card Type: {result.twitter_card.card_type}") + print(f"Title: {result.twitter_card.title}") - print("\n=== HEADINGS ===") - print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})") - print(f"H2: {result.headings.h2_count}") - print(f"H3: {result.headings.h3_count}") - print(f"H4: {result.headings.h4_count}") - print(f"H5: {result.headings.h5_count}") - print(f"H6: {result.headings.h6_count}") - print(f"Has single H1: {result.headings.has_single_h1}") - print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}") - if result.headings.hierarchy_issues: - print(f"Hierarchy issues: {result.headings.hierarchy_issues}") + print("\n=== HEADINGS ===") + print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})") + print(f"H2: {result.headings.h2_count}") + print(f"H3: {result.headings.h3_count}") + print(f"H4: {result.headings.h4_count}") + print(f"H5: {result.headings.h5_count}") + print(f"H6: {result.headings.h6_count}") + print(f"Has single H1: {result.headings.has_single_h1}") + print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}") + if result.headings.hierarchy_issues: + print(f"Hierarchy issues: {result.headings.hierarchy_issues}") - print("\n=== IMAGES ===") - print(f"Total images: {result.images.total_images}") - print(f"With alt: {result.images.images_with_alt}") - print(f"Without alt: {result.images.images_without_alt}") - print(f"With empty alt: {result.images.images_with_empty_alt}") - if result.images.alt_text_quality_issues: - print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}") + print("\n=== IMAGES ===") + print(f"Total images: {result.images.total_images}") + print(f"With alt: {result.images.images_with_alt}") + print(f"Without alt: {result.images.images_without_alt}") + print(f"With empty alt: {result.images.images_with_empty_alt}") + if result.images.alt_text_quality_issues: + print(f"Alt quality issues: 
{len(result.images.alt_text_quality_issues)}") - print("\n=== LINKS ===") - print(f"Total links: {result.links.total_links}") - print(f"Internal: {result.links.internal_links}") - print(f"External: {result.links.external_links}") - print(f"Nofollow: {result.links.nofollow_links}") - print(f"Broken anchor links: {result.links.broken_anchor_links}") - print(f"External domains: {result.links.unique_external_domains[:5]}") + print("\n=== LINKS ===") + print(f"Total links: {result.links.total_links}") + print(f"Internal: {result.links.internal_links}") + print(f"External: {result.links.external_links}") + print(f"Nofollow: {result.links.nofollow_links}") + print(f"Broken anchor links: {result.links.broken_anchor_links}") + print(f"External domains: {result.links.unique_external_domains[:5]}") - print("\n=== STRUCTURED DATA ===") - print(f"Has structured data: {result.structured_data.has_structured_data}") - print(f"JSON-LD count: {result.structured_data.json_ld_count}") - print(f"Microdata count: {result.structured_data.microdata_count}") - print(f"RDFa count: {result.structured_data.rdfa_count}") - print(f"Schema types: {result.structured_data.all_types}") + print("\n=== STRUCTURED DATA ===") + print(f"Has structured data: {result.structured_data.has_structured_data}") + print(f"JSON-LD count: {result.structured_data.json_ld_count}") + print(f"Microdata count: {result.structured_data.microdata_count}") + print(f"RDFa count: {result.structured_data.rdfa_count}") + print(f"Schema types: {result.structured_data.all_types}") - print("\n=== OTHER ===") - print(f"Word count: {result.word_count}") - print(f"Has DOCTYPE: {result.has_doctype}") - print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})") + print("\n=== OTHER ===") + print(f"Word count: {result.word_count}") + print(f"Has DOCTYPE: {result.has_doctype}") + print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})") - if result.errors: - print(f"\nErrors: 
{result.errors}") + if result.errors: + print(f"\nErrors: {result.errors}")