auto-claude: 3.2 - Add TechnicalSEOChecker class to scripts/seo_analyzer.py

Adds TechnicalSEOChecker class that performs technical SEO audits:
- robots.txt: checks existence, parses directives (Disallow, Allow, Sitemap)
  detects if blocks Googlebot or all bots
- sitemap.xml: checks existence, validates XML, counts URLs, detects sitemap index
- Canonical URLs: detects canonical tag, checks if self-referencing or cross-domain
- Noindex tags: checks meta robots and X-Robots-Tag HTTP header
- Redirect chains: follows up to 10 redirects, detects loops, HTTPS upgrades,
  www redirects, and mixed content issues

Includes:
- 8 dataclasses for structured results (RobotsTxtResult, SitemapResult, etc.)
- TechnicalSEOResult container for complete analysis
- check_technical_seo() convenience function
- CLI support: --technical/-t flag for technical-only analysis
- --all/-a flag for combined on-page and technical analysis
- --json/-j flag for JSON output

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-01-08 02:12:47 +01:00
parent 0c257f5e48
commit 81fc27dfa9

View File

@ -11,12 +11,24 @@ Analyzes HTML content for SEO factors including:
- Structured data detection (JSON-LD, Microdata, RDFa) - Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata - Open Graph and Twitter Card metadata
Usage: Also includes TechnicalSEOChecker for:
from seo_analyzer import OnPageSEOAnalyzer - robots.txt analysis
- sitemap.xml validation
- Canonical URL verification
- Noindex tag detection
- Redirect chain analysis
Usage:
from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker
# On-page analysis
analyzer = OnPageSEOAnalyzer() analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html_content, base_url='https://example.com') result = analyzer.analyze_html(html_content, base_url='https://example.com')
# Technical SEO checks
checker = TechnicalSEOChecker()
tech_result = checker.check_url('https://example.com')
Author: Claude Code Author: Claude Code
Date: 2026-01-08 Date: 2026-01-08
""" """
@ -24,10 +36,13 @@ Date: 2026-01-08
import json import json
import re import re
import logging import logging
import time
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Any, Tuple from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass, field, asdict from dataclasses import dataclass, field, asdict
from urllib.parse import urlparse, urljoin from urllib.parse import urlparse, urljoin
import requests
from bs4 import BeautifulSoup, Comment from bs4 import BeautifulSoup, Comment
# Configure logging # Configure logging
@ -750,6 +765,631 @@ class OnPageSEOAnalyzer:
return 0 return 0
# =============================================================================
# Technical SEO Checker
# =============================================================================

# Request configuration for TechnicalSEOChecker

# Per-request HTTP timeout (seconds) applied to every fetch the checker makes.
REQUEST_TIMEOUT = 15

# Browser-like User-Agent with an identifying product token appended so site
# owners can attribute the traffic to this tool.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'

# Maximum redirects to follow before giving up (bounds the redirect-chain walk).
MAX_REDIRECTS = 10
@dataclass
class RobotsTxtResult:
    """Analysis of a site's robots.txt file.

    Populated by TechnicalSEOChecker.check_robots_txt(); to_dict() yields a
    JSON-serializable view.
    """
    # True when robots.txt returned HTTP 200.
    exists: bool = False
    # Full URL that was fetched (e.g. 'https://example.com/robots.txt').
    url: Optional[str] = None
    # HTTP status code of the fetch, if a response was received.
    status_code: Optional[int] = None
    # Raw robots.txt body (only set when it exists).
    content: Optional[str] = None
    # Length of the body in characters.
    content_length: Optional[int] = None
    # De-duplicated 'Disallow:' values, collected across all user-agent sections.
    disallow_rules: List[str] = field(default_factory=list)
    # De-duplicated 'Allow:' values.
    allow_rules: List[str] = field(default_factory=list)
    # 'Sitemap:' URLs advertised in robots.txt.
    sitemap_urls: List[str] = field(default_factory=list)
    # 'Crawl-delay:' value when present and numeric.
    crawl_delay: Optional[float] = None
    # True when a blanket 'Disallow: /' applies to a Googlebot section.
    blocks_googlebot: bool = False
    # True when a blanket 'Disallow: /' applies to 'User-agent: *'.
    blocks_all_bots: bool = False
    # Fetch problems encountered (timeout, connection error, odd status).
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class SitemapResult:
    """Analysis of a sitemap.xml file.

    Populated by TechnicalSEOChecker.check_sitemap(); to_dict() yields a
    JSON-serializable view.
    """
    # True when the sitemap URL returned HTTP 200.
    exists: bool = False
    # The sitemap URL that was fetched.
    url: Optional[str] = None
    # HTTP status code of the fetch, if a response was received.
    status_code: Optional[int] = None
    # Body parsed successfully as XML.
    is_valid_xml: bool = False
    # Root contains <sitemap> entries — a sitemap index, not a URL set.
    is_sitemap_index: bool = False
    # Number of <url> entries (regular sitemap only).
    url_count: int = 0
    sitemap_count: int = 0  # For sitemap index
    # Up to 10 page URLs, or up to 5 child sitemap URLs for an index.
    sample_urls: List[str] = field(default_factory=list)
    # Last-Modified response header, if the server sent one.
    last_modified: Optional[str] = None
    # Body length in bytes.
    content_length: Optional[int] = None
    # Fetch/parse problems encountered.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class RedirectInfo:
    """Information about a single redirect hop in a chain."""
    # URL that issued the redirect.
    from_url: str
    # Target of the Location header (made absolute by the checker).
    to_url: str
    # 3xx status code (301, 302, 303, 307 or 308).
    status_code: int
    # http -> https where the hosts match once any 'www.' is stripped.
    is_https_upgrade: bool = False
    # Hosts match once 'www.' is stripped, but the raw hosts differ.
    is_www_redirect: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class RedirectChainResult:
    """Analysis of the redirect chain for a URL.

    Populated by TechnicalSEOChecker.check_redirect_chain(); to_dict() yields
    a JSON-serializable view with each hop as a plain dict.
    """
    # URL the walk started from.
    original_url: str
    # Last URL reached (the non-redirect response, or last URL attempted).
    final_url: str
    # Number of redirect hops followed.
    chain_length: int = 0
    # Forward reference: resolved only by typing tools, so this class does not
    # require RedirectInfo to be defined at class-creation time.
    redirects: List["RedirectInfo"] = field(default_factory=list)
    # A URL was visited twice while following redirects.
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # HTTP -> HTTPS -> HTTP
    # Wall-clock time spent walking the chain, in milliseconds.
    total_time_ms: Optional[int] = None
    # Problems encountered while walking the chain.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict.

        dataclasses.asdict() recurses into nested dataclasses, so the
        RedirectInfo entries already come back as plain dicts — no manual
        per-item conversion pass is needed.
        """
        return asdict(self)
@dataclass
class CanonicalResult:
    """Analysis of a page's canonical URL configuration.

    Populated by TechnicalSEOChecker._check_canonical() from the page HTML.
    """
    # A <link rel="canonical"> tag was found.
    has_canonical: bool = False
    # The href value exactly as written in the HTML.
    canonical_url: Optional[str] = None
    # Canonical points at the current page (host and path match, 'www.' ignored).
    is_self_referencing: bool = False
    # Canonical host differs from the current host ('www.' ignored).
    points_to_different_domain: bool = False
    # href was relative rather than an absolute http(s) URL.
    is_relative: bool = False
    # Resolved canonical has both a scheme and a host.
    is_valid_url: bool = False
    # Resolved canonical equals the current URL (trailing slash ignored).
    matches_current_url: bool = False
    # HTML parsing problems, if any.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class IndexabilityResult:
    """Analysis of page indexability.

    Populated by TechnicalSEOChecker._check_indexability() from the response
    headers and HTML meta tags.
    """
    # Flipped to False as soon as any noindex signal is found.
    is_indexable: bool = True
    # <meta name="robots"> or <meta name="googlebot"> contains 'noindex'.
    has_noindex_meta: bool = False
    # X-Robots-Tag response header contains 'noindex'.
    has_noindex_header: bool = False
    noindex_source: Optional[str] = None  # 'meta', 'header', 'robots.txt'
    # Raw content attribute of the meta robots tag, if present.
    meta_robots_content: Optional[str] = None
    # Raw X-Robots-Tag header value, if present.
    x_robots_tag: Optional[str] = None
    # HTML parsing problems, if any.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict representation suitable for JSON output."""
        return asdict(self)
@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result.

    Container aggregating the per-area results produced by
    TechnicalSEOChecker.check_url().
    """
    # The (normalized) URL that was checked.
    url: str
    # ISO-8601 timestamp of when the check ran.
    checked_at: str
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    # Errors from the page fetch itself; per-area errors live on each sub-result.
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a JSON-serializable dict, delegating to each sub-result's to_dict()."""
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }
class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')

        # Access specific results
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Request timeout in seconds, applied to every HTTP request.
        """
        self.timeout = timeout
        # One shared Session reuses connections across the several requests a
        # full check performs and carries the custom User-Agent everywhere.
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def check_url(self, url: str) -> TechnicalSEOResult:
        """
        Perform a complete technical SEO check for a URL.

        Args:
            url: The URL to check. 'https://' is assumed when no scheme is given.

        Returns:
            TechnicalSEOResult with all technical SEO analysis.
        """
        from datetime import datetime

        errors: List[str] = []

        # Normalize URL: default to HTTPS when the scheme is missing.
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url
        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # robots.txt first: it may advertise the site's real sitemap URL.
        robots_result = self.check_robots_txt(base_url)

        # Prefer a sitemap advertised in robots.txt; otherwise fall back to
        # the conventional /sitemap.xml location.
        if robots_result.sitemap_urls:
            sitemap_url = robots_result.sitemap_urls[0]
        else:
            sitemap_url = f"{base_url}/sitemap.xml"
        sitemap_result = self.check_sitemap(sitemap_url)

        # Walk the redirect chain from the original URL.
        redirect_result = self.check_redirect_chain(url)

        # Fetch the page once and use it for both canonical and indexability
        # checks; on failure the defaults (empty results) are returned.
        canonical_result = CanonicalResult()
        indexability_result = IndexabilityResult()
        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            final_url = response.url
            if response.status_code == 200:
                canonical_result = self._check_canonical(response.text, final_url)
                indexability_result = self._check_indexability(response)
            else:
                errors.append(f"HTTP {response.status_code} when fetching page")
        except requests.exceptions.Timeout:
            errors.append(f"Timeout fetching {url}")
        except requests.exceptions.ConnectionError as e:
            errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            errors.append(f"Request error: {str(e)[:100]}")

        return TechnicalSEOResult(
            url=url,
            checked_at=datetime.now().isoformat(),
            robots_txt=robots_result,
            sitemap=sitemap_result,
            redirect_chain=redirect_result,
            canonical=canonical_result,
            indexability=indexability_result,
            errors=errors,
        )

    def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
        """
        Check the robots.txt file for a domain.

        Args:
            base_url: Base URL of the site (e.g., 'https://example.com').

        Returns:
            RobotsTxtResult with robots.txt analysis. A 404 is not an error —
            it simply means the file does not exist.
        """
        result = RobotsTxtResult()
        robots_url = f"{base_url.rstrip('/')}/robots.txt"
        result.url = robots_url
        try:
            response = self.session.get(robots_url, timeout=self.timeout)
            result.status_code = response.status_code
            if response.status_code == 200:
                result.exists = True
                result.content = response.text
                result.content_length = len(response.text)
                self._parse_robots_txt(response.text, result)
            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")
        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching robots.txt")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
        return result

    def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
        """Parse robots.txt content and populate *result* in place.

        Disallow/Allow rules from every user-agent section are collected
        together; the blocks_* flags track only a blanket 'Disallow: /' in
        the relevant section.
        """
        current_user_agent = None
        is_googlebot_section = False
        is_all_section = False
        for line in content.split('\n'):
            line = line.strip()
            # Skip empty lines and comments.
            if not line or line.startswith('#'):
                continue
            # Directives are 'name: value'. Split on the first colon only,
            # since values (e.g. sitemap URLs) contain colons themselves.
            if ':' not in line:
                continue
            directive, value = line.split(':', 1)
            directive = directive.strip().lower()
            value = value.strip()
            if directive == 'user-agent':
                current_user_agent = value.lower()
                is_googlebot_section = 'googlebot' in current_user_agent
                is_all_section = current_user_agent == '*'
            elif directive == 'disallow' and value:
                result.disallow_rules.append(value)
                # 'Disallow: /' blocks the entire site for that section.
                if value == '/' and (is_googlebot_section or is_all_section):
                    if is_googlebot_section:
                        result.blocks_googlebot = True
                    if is_all_section:
                        result.blocks_all_bots = True
            elif directive == 'allow' and value:
                result.allow_rules.append(value)
            elif directive == 'sitemap':
                if value and value not in result.sitemap_urls:
                    result.sitemap_urls.append(value)
            elif directive == 'crawl-delay':
                try:
                    result.crawl_delay = float(value)
                except ValueError:
                    pass  # non-numeric crawl-delay: ignore rather than fail
        # De-duplicate while preserving first-seen order.
        result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
        result.allow_rules = list(dict.fromkeys(result.allow_rules))

    def check_sitemap(self, sitemap_url: str) -> SitemapResult:
        """
        Check a sitemap.xml file.

        Args:
            sitemap_url: URL of the sitemap.

        Returns:
            SitemapResult with sitemap analysis. A 404 is not an error — it
            simply means the sitemap does not exist.
        """
        result = SitemapResult()
        result.url = sitemap_url
        try:
            response = self.session.get(sitemap_url, timeout=self.timeout)
            result.status_code = response.status_code
            if response.status_code == 200:
                result.exists = True
                result.content_length = len(response.content)
                last_modified = response.headers.get('Last-Modified')
                if last_modified:
                    result.last_modified = last_modified
                # Parse bytes, not text, so the XML declaration's declared
                # encoding is honored by ElementTree.
                self._parse_sitemap(response.content, result)
            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")
        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching sitemap")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")
        return result

    @staticmethod
    def _find_loc(parent: ET.Element, ns: Dict[str, str]) -> Optional[ET.Element]:
        """Find a <loc> child, trying the sitemaps namespace first.

        NOTE: `parent.find('sm:loc', ns) or parent.find('loc')` would be
        wrong here — an ElementTree Element with no children is falsy, so a
        successfully found <loc> (which has text but no children) would be
        discarded and the non-namespaced fallback used instead. Compare
        against None explicitly.
        """
        loc = parent.find('sm:loc', ns)
        if loc is None:
            loc = parent.find('loc')
        return loc

    def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
        """Parse sitemap XML content and populate *result* in place."""
        try:
            root = ET.fromstring(content)
            result.is_valid_xml = True
            # Handle sitemaps with and without the standard namespace.
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            # findall returns a list, so `or` is a safe empty-list fallback.
            sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
            if sitemap_tags:
                # Sitemap index: entries point at child sitemaps, not pages.
                result.is_sitemap_index = True
                result.sitemap_count = len(sitemap_tags)
                for sitemap_tag in sitemap_tags[:5]:
                    loc = self._find_loc(sitemap_tag, ns)
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)
            else:
                # Regular sitemap: count page URLs and keep a small sample.
                url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
                result.url_count = len(url_tags)
                for url_tag in url_tags[:10]:
                    loc = self._find_loc(url_tag, ns)
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)
        except ET.ParseError as e:
            result.is_valid_xml = False
            result.errors.append(f"Invalid XML: {str(e)[:100]}")
        except Exception as e:
            result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")

    def check_redirect_chain(self, url: str) -> RedirectChainResult:
        """
        Follow and analyze the redirect chain for a URL.

        Follows up to MAX_REDIRECTS hops without auto-redirecting, recording
        each hop, loop detection, HTTPS upgrades, www redirects and
        HTTPS->HTTP downgrades (mixed content).

        Args:
            url: The URL to check.

        Returns:
            RedirectChainResult with redirect chain analysis. final_url is
            the last URL reached, whatever ended the walk.
        """
        result = RedirectChainResult(original_url=url, final_url=url)
        visited_urls = set()
        current_url = url
        start_time = time.time()
        for _ in range(MAX_REDIRECTS):
            if current_url in visited_urls:
                result.has_redirect_loop = True
                result.errors.append(f"Redirect loop detected at: {current_url}")
                break
            visited_urls.add(current_url)
            try:
                # allow_redirects=False so each hop is observed individually.
                response = self.session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False
                )
            except requests.exceptions.Timeout:
                result.errors.append(f"Timeout at: {current_url}")
                break
            except requests.exceptions.ConnectionError as e:
                result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
                break
            except requests.exceptions.RequestException as e:
                result.errors.append(f"Request error: {str(e)[:100]}")
                break
            if response.status_code not in (301, 302, 303, 307, 308):
                # Reached a non-redirect response: the chain ends here.
                break
            next_url = response.headers.get('Location')
            if not next_url:
                result.errors.append("Redirect without Location header")
                break
            # Resolve relative Location targets (urljoin also handles
            # root-relative '/path' and protocol-relative '//host/path').
            if not next_url.startswith(('http://', 'https://')):
                next_url = urljoin(current_url, next_url)
            parsed_from = urlparse(current_url)
            parsed_to = urlparse(next_url)
            bare_from = parsed_from.netloc.replace('www.', '')
            bare_to = parsed_to.netloc.replace('www.', '')
            result.redirects.append(RedirectInfo(
                from_url=current_url,
                to_url=next_url,
                status_code=response.status_code,
                is_https_upgrade=(
                    parsed_from.scheme == 'http' and
                    parsed_to.scheme == 'https' and
                    bare_from == bare_to
                ),
                is_www_redirect=(
                    bare_from == bare_to and
                    parsed_from.netloc != parsed_to.netloc
                ),
            ))
            # Mixed content: some hop went HTTPS and a later hop fell back
            # to HTTP (i.e. 'http' appears after the first 'https').
            if len(result.redirects) >= 2:
                schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                schemes.append(parsed_to.scheme)
                if 'https' in schemes:
                    first_https = schemes.index('https')
                    if 'http' in schemes[first_https + 1:]:
                        result.has_mixed_content = True
            current_url = next_url
        else:
            # Loop ran MAX_REDIRECTS times without reaching a final response.
            result.errors.append(f"Exceeded {MAX_REDIRECTS} redirects")
        # Whatever ended the walk (final response, loop, error, exhaustion),
        # the last URL visited is the final URL.
        result.final_url = current_url
        result.chain_length = len(result.redirects)
        result.total_time_ms = int((time.time() - start_time) * 1000)
        return result

    def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
        """
        Check canonical URL configuration from page HTML.

        Args:
            html: HTML content of the page.
            current_url: URL the page was fetched from (after redirects).

        Returns:
            CanonicalResult with canonical URL analysis.
        """
        result = CanonicalResult()
        try:
            # Prefer lxml for speed/robustness; fall back to the stdlib parser.
            soup = BeautifulSoup(html, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result
        canonical_tag = soup.find('link', rel='canonical')
        if canonical_tag:
            result.has_canonical = True
            canonical_url = canonical_tag.get('href', '')
            result.canonical_url = canonical_url
            if canonical_url:
                result.is_relative = not canonical_url.startswith(('http://', 'https://'))
                if result.is_relative:
                    # urljoin resolves path-relative, root-relative ('/x')
                    # and protocol-relative ('//host/x') hrefs correctly.
                    canonical_abs = urljoin(current_url, canonical_url)
                else:
                    canonical_abs = canonical_url
                parsed_canonical = urlparse(canonical_abs)
                parsed_current = urlparse(current_url)
                result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc)
                # Self-referencing: same host (ignoring 'www.') and same path.
                result.is_self_referencing = (
                    parsed_canonical.netloc.replace('www.', '') == parsed_current.netloc.replace('www.', '') and
                    parsed_canonical.path == parsed_current.path
                )
                result.points_to_different_domain = (
                    parsed_canonical.netloc.replace('www.', '') != parsed_current.netloc.replace('www.', '')
                )
                # Exact match, tolerating only a trailing-slash difference.
                result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/'))
        return result

    def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
        """
        Check whether the page is indexable via meta tags and HTTP headers.

        Args:
            response: Response object from fetching the page.

        Returns:
            IndexabilityResult with indexability analysis. The header check
            runs first, so 'header' wins as noindex_source when both signals
            are present.
        """
        result = IndexabilityResult()
        # X-Robots-Tag HTTP header applies regardless of the HTML content.
        x_robots = response.headers.get('X-Robots-Tag', '')
        if x_robots:
            result.x_robots_tag = x_robots
            if 'noindex' in x_robots.lower():
                result.has_noindex_header = True
                result.is_indexable = False
                result.noindex_source = 'header'
        try:
            soup = BeautifulSoup(response.text, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result
        # <meta name="robots"> — name matched case-insensitively.
        meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
        if meta_robots:
            content = meta_robots.get('content', '')
            result.meta_robots_content = content
            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'
        # Googlebot-specific meta tag can also impose noindex.
        meta_googlebot = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
        if meta_googlebot:
            content = meta_googlebot.get('content', '')
            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'
        return result
# Convenience function # Convenience function
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]: def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
""" """
@ -767,20 +1407,139 @@ def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
return result.to_dict() return result.to_dict()
def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience wrapper: run a complete technical SEO check on *url*.

    Instantiates a TechnicalSEOChecker with default settings, runs
    check_url(), and flattens the result for callers that want plain data.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    return TechnicalSEOChecker().check_url(url).to_dict()
if __name__ == '__main__': if __name__ == '__main__':
import sys import sys
import requests import argparse
if len(sys.argv) < 2: parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
print("Usage: python seo_analyzer.py <url>") parser.add_argument('url', help='URL to analyze')
print("Example: python seo_analyzer.py https://pixlab.pl") parser.add_argument('--technical', '-t', action='store_true',
sys.exit(1) help='Run technical SEO checks (robots.txt, sitemap, redirects)')
parser.add_argument('--all', '-a', action='store_true',
help='Run both on-page and technical SEO analysis')
parser.add_argument('--json', '-j', action='store_true',
help='Output results as JSON')
test_url = sys.argv[1] args = parser.parse_args()
test_url = args.url
print(f"Analyzing: {test_url}") print(f"Analyzing: {test_url}")
print("-" * 60) print("-" * 60)
# Run technical SEO checks if requested
if args.technical or args.all:
print("\n" + "=" * 60)
print("TECHNICAL SEO ANALYSIS")
print("=" * 60)
checker = TechnicalSEOChecker()
tech_result = checker.check_url(test_url)
if args.json:
print(json.dumps(tech_result.to_dict(), indent=2, default=str))
else:
print("\n=== ROBOTS.TXT ===")
print(f"Exists: {tech_result.robots_txt.exists}")
print(f"URL: {tech_result.robots_txt.url}")
print(f"Status code: {tech_result.robots_txt.status_code}")
if tech_result.robots_txt.exists:
print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
if tech_result.robots_txt.disallow_rules[:5]:
print(f" Sample: {tech_result.robots_txt.disallow_rules[:5]}")
print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
if tech_result.robots_txt.crawl_delay:
print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
if tech_result.robots_txt.errors:
print(f"Errors: {tech_result.robots_txt.errors}")
print("\n=== SITEMAP ===")
print(f"Exists: {tech_result.sitemap.exists}")
print(f"URL: {tech_result.sitemap.url}")
print(f"Status code: {tech_result.sitemap.status_code}")
if tech_result.sitemap.exists:
print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
if tech_result.sitemap.is_sitemap_index:
print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
else:
print(f"URL count: {tech_result.sitemap.url_count}")
if tech_result.sitemap.sample_urls:
print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
if tech_result.sitemap.errors:
print(f"Errors: {tech_result.sitemap.errors}")
print("\n=== REDIRECT CHAIN ===")
print(f"Original URL: {tech_result.redirect_chain.original_url}")
print(f"Final URL: {tech_result.redirect_chain.final_url}")
print(f"Chain length: {tech_result.redirect_chain.chain_length}")
if tech_result.redirect_chain.redirects:
for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
print(f" [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
if r.is_https_upgrade:
print(f" (HTTPS upgrade)")
if r.is_www_redirect:
print(f" (www redirect)")
print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
if tech_result.redirect_chain.errors:
print(f"Errors: {tech_result.redirect_chain.errors}")
print("\n=== CANONICAL ===")
print(f"Has canonical: {tech_result.canonical.has_canonical}")
if tech_result.canonical.has_canonical:
print(f"Canonical URL: {tech_result.canonical.canonical_url}")
print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
print(f"Is relative: {tech_result.canonical.is_relative}")
print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
if tech_result.canonical.errors:
print(f"Errors: {tech_result.canonical.errors}")
print("\n=== INDEXABILITY ===")
print(f"Is indexable: {tech_result.indexability.is_indexable}")
print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
if tech_result.indexability.noindex_source:
print(f"Noindex source: {tech_result.indexability.noindex_source}")
if tech_result.indexability.meta_robots_content:
print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
if tech_result.indexability.x_robots_tag:
print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
if tech_result.indexability.errors:
print(f"Errors: {tech_result.indexability.errors}")
if tech_result.errors:
print(f"\n=== GENERAL ERRORS ===")
for error in tech_result.errors:
print(f" - {error}")
# If only technical was requested, exit
if not args.all:
sys.exit(0)
# Run on-page analysis (default behavior)
print("\n" + "=" * 60)
print("ON-PAGE SEO ANALYSIS")
print("=" * 60)
# Fetch the page # Fetch the page
try: try:
headers = { headers = {
@ -797,65 +1556,68 @@ if __name__ == '__main__':
analyzer = OnPageSEOAnalyzer() analyzer = OnPageSEOAnalyzer()
result = analyzer.analyze_html(html, test_url) result = analyzer.analyze_html(html, test_url)
# Print results if args.json:
print("\n=== META TAGS ===") print(json.dumps(result.to_dict(), indent=2, default=str))
print(f"Title: {result.meta_tags.title}") else:
print(f"Title length: {result.meta_tags.title_length}") # Print results
print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...") print("\n=== META TAGS ===")
print(f"Description length: {result.meta_tags.description_length}") print(f"Title: {result.meta_tags.title}")
print(f"Canonical: {result.meta_tags.canonical_url}") print(f"Title length: {result.meta_tags.title_length}")
print(f"Robots: {result.meta_tags.robots}") print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
print(f"Viewport: {result.meta_tags.viewport}") print(f"Description length: {result.meta_tags.description_length}")
print(f"Canonical: {result.meta_tags.canonical_url}")
print(f"Robots: {result.meta_tags.robots}")
print(f"Viewport: {result.meta_tags.viewport}")
print("\n=== OPEN GRAPH ===") print("\n=== OPEN GRAPH ===")
print(f"OG Title: {result.open_graph.og_title}") print(f"OG Title: {result.open_graph.og_title}")
print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...") print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
print(f"OG Image: {result.open_graph.og_image}") print(f"OG Image: {result.open_graph.og_image}")
print(f"OG Type: {result.open_graph.og_type}") print(f"OG Type: {result.open_graph.og_type}")
print("\n=== TWITTER CARD ===") print("\n=== TWITTER CARD ===")
print(f"Card Type: {result.twitter_card.card_type}") print(f"Card Type: {result.twitter_card.card_type}")
print(f"Title: {result.twitter_card.title}") print(f"Title: {result.twitter_card.title}")
print("\n=== HEADINGS ===") print("\n=== HEADINGS ===")
print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})") print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
print(f"H2: {result.headings.h2_count}") print(f"H2: {result.headings.h2_count}")
print(f"H3: {result.headings.h3_count}") print(f"H3: {result.headings.h3_count}")
print(f"H4: {result.headings.h4_count}") print(f"H4: {result.headings.h4_count}")
print(f"H5: {result.headings.h5_count}") print(f"H5: {result.headings.h5_count}")
print(f"H6: {result.headings.h6_count}") print(f"H6: {result.headings.h6_count}")
print(f"Has single H1: {result.headings.has_single_h1}") print(f"Has single H1: {result.headings.has_single_h1}")
print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}") print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
if result.headings.hierarchy_issues: if result.headings.hierarchy_issues:
print(f"Hierarchy issues: {result.headings.hierarchy_issues}") print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
print("\n=== IMAGES ===") print("\n=== IMAGES ===")
print(f"Total images: {result.images.total_images}") print(f"Total images: {result.images.total_images}")
print(f"With alt: {result.images.images_with_alt}") print(f"With alt: {result.images.images_with_alt}")
print(f"Without alt: {result.images.images_without_alt}") print(f"Without alt: {result.images.images_without_alt}")
print(f"With empty alt: {result.images.images_with_empty_alt}") print(f"With empty alt: {result.images.images_with_empty_alt}")
if result.images.alt_text_quality_issues: if result.images.alt_text_quality_issues:
print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}") print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
print("\n=== LINKS ===") print("\n=== LINKS ===")
print(f"Total links: {result.links.total_links}") print(f"Total links: {result.links.total_links}")
print(f"Internal: {result.links.internal_links}") print(f"Internal: {result.links.internal_links}")
print(f"External: {result.links.external_links}") print(f"External: {result.links.external_links}")
print(f"Nofollow: {result.links.nofollow_links}") print(f"Nofollow: {result.links.nofollow_links}")
print(f"Broken anchor links: {result.links.broken_anchor_links}") print(f"Broken anchor links: {result.links.broken_anchor_links}")
print(f"External domains: {result.links.unique_external_domains[:5]}") print(f"External domains: {result.links.unique_external_domains[:5]}")
print("\n=== STRUCTURED DATA ===") print("\n=== STRUCTURED DATA ===")
print(f"Has structured data: {result.structured_data.has_structured_data}") print(f"Has structured data: {result.structured_data.has_structured_data}")
print(f"JSON-LD count: {result.structured_data.json_ld_count}") print(f"JSON-LD count: {result.structured_data.json_ld_count}")
print(f"Microdata count: {result.structured_data.microdata_count}") print(f"Microdata count: {result.structured_data.microdata_count}")
print(f"RDFa count: {result.structured_data.rdfa_count}") print(f"RDFa count: {result.structured_data.rdfa_count}")
print(f"Schema types: {result.structured_data.all_types}") print(f"Schema types: {result.structured_data.all_types}")
print("\n=== OTHER ===") print("\n=== OTHER ===")
print(f"Word count: {result.word_count}") print(f"Word count: {result.word_count}")
print(f"Has DOCTYPE: {result.has_doctype}") print(f"Has DOCTYPE: {result.has_doctype}")
print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})") print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
if result.errors: if result.errors:
print(f"\nErrors: {result.errors}") print(f"\nErrors: {result.errors}")