auto-claude: 3.2 - Add TechnicalSEOChecker class to scripts/seo_analyzer.py
Adds TechnicalSEOChecker class that performs technical SEO audits:

- robots.txt: checks existence, parses directives (Disallow, Allow, Sitemap), detects whether it blocks Googlebot or all bots
- sitemap.xml: checks existence, validates XML, counts URLs, detects sitemap index
- Canonical URLs: detects canonical tag, checks if self-referencing or cross-domain
- Noindex tags: checks meta robots and X-Robots-Tag HTTP header
- Redirect chains: follows up to 10 redirects, detects loops, HTTPS upgrades, www redirects, and mixed content issues

Includes:

- 8 dataclasses for structured results (RobotsTxtResult, SitemapResult, etc.)
- TechnicalSEOResult container for complete analysis
- check_technical_seo() convenience function
- CLI support: --technical/-t flag for technical-only analysis
- --all/-a flag for combined on-page and technical analysis
- --json/-j flag for JSON output

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent 0c257f5e48
commit 81fc27dfa9
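
For orientation, a minimal usage sketch of the additions described in the commit message (a sketch based on the docstrings below, not part of the diff; example.com is a placeholder):

    from seo_analyzer import TechnicalSEOChecker, check_technical_seo

    # Object API: structured result with per-check dataclasses
    checker = TechnicalSEOChecker()
    result = checker.check_url('https://example.com')
    print(result.robots_txt.exists, result.indexability.is_indexable)

    # Convenience wrapper: the same analysis as a plain dict (JSON-friendly)
    report = check_technical_seo('https://example.com')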
@@ -11,12 +11,24 @@ Analyzes HTML content for SEO factors including:
- Structured data detection (JSON-LD, Microdata, RDFa)
- Open Graph and Twitter Card metadata

Usage:
    from seo_analyzer import OnPageSEOAnalyzer
Also includes TechnicalSEOChecker for:
- robots.txt analysis
- sitemap.xml validation
- Canonical URL verification
- Noindex tag detection
- Redirect chain analysis

Usage:
    from seo_analyzer import OnPageSEOAnalyzer, TechnicalSEOChecker

    # On-page analysis
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html_content, base_url='https://example.com')

    # Technical SEO checks
    checker = TechnicalSEOChecker()
    tech_result = checker.check_url('https://example.com')

Author: Claude Code
Date: 2026-01-08
"""
@@ -24,10 +36,13 @@ Date: 2026-01-08
import json
import re
import logging
import time
import xml.etree.ElementTree as ET
from typing import Optional, Dict, List, Any, Tuple
from dataclasses import dataclass, field, asdict
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup, Comment

# Configure logging
@@ -750,6 +765,631 @@ class OnPageSEOAnalyzer:
        return 0


# =============================================================================
# Technical SEO Checker
# =============================================================================

# Request configuration for TechnicalSEOChecker
REQUEST_TIMEOUT = 15
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 NordaBiznes-SEO-Checker/1.0'

# Maximum redirects to follow
MAX_REDIRECTS = 10


@dataclass
class RobotsTxtResult:
    """Analysis of robots.txt file."""
    exists: bool = False
    url: Optional[str] = None
    status_code: Optional[int] = None
    content: Optional[str] = None
    content_length: Optional[int] = None
    disallow_rules: List[str] = field(default_factory=list)
    allow_rules: List[str] = field(default_factory=list)
    sitemap_urls: List[str] = field(default_factory=list)
    crawl_delay: Optional[float] = None
    blocks_googlebot: bool = False
    blocks_all_bots: bool = False
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class SitemapResult:
    """Analysis of sitemap.xml file."""
    exists: bool = False
    url: Optional[str] = None
    status_code: Optional[int] = None
    is_valid_xml: bool = False
    is_sitemap_index: bool = False
    url_count: int = 0
    sitemap_count: int = 0  # For sitemap index
    sample_urls: List[str] = field(default_factory=list)
    last_modified: Optional[str] = None
    content_length: Optional[int] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class RedirectInfo:
    """Information about a single redirect."""
    from_url: str
    to_url: str
    status_code: int
    is_https_upgrade: bool = False
    is_www_redirect: bool = False

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class RedirectChainResult:
    """Analysis of redirect chain for a URL."""
    original_url: str
    final_url: str
    chain_length: int = 0
    redirects: List[RedirectInfo] = field(default_factory=list)
    has_redirect_loop: bool = False
    has_mixed_content: bool = False  # HTTP -> HTTPS -> HTTP
    total_time_ms: Optional[int] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        result = asdict(self)
        result['redirects'] = [r.to_dict() if hasattr(r, 'to_dict') else r for r in self.redirects]
        return result


@dataclass
class CanonicalResult:
    """Analysis of canonical URL configuration."""
    has_canonical: bool = False
    canonical_url: Optional[str] = None
    is_self_referencing: bool = False
    points_to_different_domain: bool = False
    is_relative: bool = False
    is_valid_url: bool = False
    matches_current_url: bool = False
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class IndexabilityResult:
    """Analysis of page indexability."""
    is_indexable: bool = True
    has_noindex_meta: bool = False
    has_noindex_header: bool = False
    noindex_source: Optional[str] = None  # 'meta', 'header', 'robots.txt'
    meta_robots_content: Optional[str] = None
    x_robots_tag: Optional[str] = None
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return asdict(self)


@dataclass
class TechnicalSEOResult:
    """Complete technical SEO check result."""
    url: str
    checked_at: str
    robots_txt: RobotsTxtResult
    sitemap: SitemapResult
    redirect_chain: RedirectChainResult
    canonical: CanonicalResult
    indexability: IndexabilityResult
    errors: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        return {
            'url': self.url,
            'checked_at': self.checked_at,
            'robots_txt': self.robots_txt.to_dict(),
            'sitemap': self.sitemap.to_dict(),
            'redirect_chain': self.redirect_chain.to_dict(),
            'canonical': self.canonical.to_dict(),
            'indexability': self.indexability.to_dict(),
            'errors': self.errors,
        }


class TechnicalSEOChecker:
    """
    Checks technical SEO factors for a website.

    Analyzes:
    - robots.txt presence and configuration
    - sitemap.xml presence and validity
    - Canonical URL configuration
    - Noindex tags (meta and HTTP header)
    - Redirect chains

    Usage:
        checker = TechnicalSEOChecker()
        result = checker.check_url('https://example.com')

        # Access specific results
        print(f"robots.txt exists: {result.robots_txt.exists}")
        print(f"sitemap.xml exists: {result.sitemap.exists}")
        print(f"Redirect chain length: {result.redirect_chain.chain_length}")
        print(f"Is indexable: {result.indexability.is_indexable}")
    """

    def __init__(self, timeout: int = REQUEST_TIMEOUT):
        """
        Initialize the TechnicalSEOChecker.

        Args:
            timeout: Request timeout in seconds.
        """
        self.timeout = timeout
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': USER_AGENT})

    def check_url(self, url: str) -> TechnicalSEOResult:
        """
        Perform complete technical SEO check for a URL.

        Args:
            url: The URL to check.

        Returns:
            TechnicalSEOResult with all technical SEO analysis.
        """
        from datetime import datetime

        errors = []

        # Normalize URL
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        parsed = urlparse(url)
        base_url = f"{parsed.scheme}://{parsed.netloc}"

        # Check robots.txt
        robots_result = self.check_robots_txt(base_url)

        # Check sitemap.xml (use sitemap from robots.txt if available)
        sitemap_urls = robots_result.sitemap_urls if robots_result.sitemap_urls else [f"{base_url}/sitemap.xml"]
        sitemap_result = self.check_sitemap(sitemap_urls[0] if sitemap_urls else f"{base_url}/sitemap.xml")

        # Check redirect chain
        redirect_result = self.check_redirect_chain(url)

        # Fetch page for canonical and indexability checks
        canonical_result = CanonicalResult()
        indexability_result = IndexabilityResult()

        try:
            response = self.session.get(url, timeout=self.timeout, allow_redirects=True)
            final_url = response.url

            # Parse HTML for canonical and noindex
            if response.status_code == 200:
                canonical_result = self._check_canonical(response.text, final_url)
                indexability_result = self._check_indexability(response)
            else:
                errors.append(f"HTTP {response.status_code} when fetching page")

        except requests.exceptions.Timeout:
            errors.append(f"Timeout fetching {url}")
        except requests.exceptions.ConnectionError as e:
            errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            errors.append(f"Request error: {str(e)[:100]}")

        return TechnicalSEOResult(
            url=url,
            checked_at=datetime.now().isoformat(),
            robots_txt=robots_result,
            sitemap=sitemap_result,
            redirect_chain=redirect_result,
            canonical=canonical_result,
            indexability=indexability_result,
            errors=errors,
        )

    def check_robots_txt(self, base_url: str) -> RobotsTxtResult:
        """
        Check robots.txt file for a domain.

        Args:
            base_url: Base URL of the site (e.g., 'https://example.com').

        Returns:
            RobotsTxtResult with robots.txt analysis.
        """
        result = RobotsTxtResult()
        robots_url = f"{base_url.rstrip('/')}/robots.txt"
        result.url = robots_url

        try:
            response = self.session.get(robots_url, timeout=self.timeout)
            result.status_code = response.status_code

            if response.status_code == 200:
                result.exists = True
                result.content = response.text
                result.content_length = len(response.text)

                # Parse robots.txt
                self._parse_robots_txt(response.text, result)
            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")

        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching robots.txt")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")

        return result

    def _parse_robots_txt(self, content: str, result: RobotsTxtResult) -> None:
        """Parse robots.txt content and populate result."""
        current_user_agent = None
        is_googlebot_section = False
        is_all_section = False

        for line in content.split('\n'):
            line = line.strip()

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue

            # Split on first colon
            if ':' not in line:
                continue

            directive, value = line.split(':', 1)
            directive = directive.strip().lower()
            value = value.strip()

            if directive == 'user-agent':
                current_user_agent = value.lower()
                is_googlebot_section = 'googlebot' in current_user_agent
                is_all_section = current_user_agent == '*'

            elif directive == 'disallow' and value:
                result.disallow_rules.append(value)
                # Check if blocking important paths
                if value == '/' and (is_googlebot_section or is_all_section):
                    if is_googlebot_section:
                        result.blocks_googlebot = True
                    if is_all_section:
                        result.blocks_all_bots = True

            elif directive == 'allow' and value:
                result.allow_rules.append(value)

            elif directive == 'sitemap':
                if value and value not in result.sitemap_urls:
                    result.sitemap_urls.append(value)

            elif directive == 'crawl-delay':
                try:
                    result.crawl_delay = float(value)
                except ValueError:
                    pass

        # Deduplicate
        result.disallow_rules = list(dict.fromkeys(result.disallow_rules))
        result.allow_rules = list(dict.fromkeys(result.allow_rules))
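    # Illustrative example (editor's note, not part of the commit): given
    #
    #     User-agent: *
    #     Disallow: /admin
    #     Sitemap: https://example.com/sitemap.xml
    #
    # _parse_robots_txt sets disallow_rules=['/admin'] and
    # sitemap_urls=['https://example.com/sitemap.xml'], and leaves
    # blocks_all_bots False because the rule is not 'Disallow: /'.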

    def check_sitemap(self, sitemap_url: str) -> SitemapResult:
        """
        Check sitemap.xml file.

        Args:
            sitemap_url: URL of the sitemap.

        Returns:
            SitemapResult with sitemap analysis.
        """
        result = SitemapResult()
        result.url = sitemap_url

        try:
            response = self.session.get(sitemap_url, timeout=self.timeout)
            result.status_code = response.status_code

            if response.status_code == 200:
                result.exists = True
                result.content_length = len(response.content)

                # Check Last-Modified header
                last_modified = response.headers.get('Last-Modified')
                if last_modified:
                    result.last_modified = last_modified

                # Parse XML
                self._parse_sitemap(response.content, result)

            elif response.status_code == 404:
                result.exists = False
            else:
                result.errors.append(f"Unexpected status code: {response.status_code}")

        except requests.exceptions.Timeout:
            result.errors.append("Timeout fetching sitemap")
        except requests.exceptions.ConnectionError as e:
            result.errors.append(f"Connection error: {str(e)[:100]}")
        except requests.exceptions.RequestException as e:
            result.errors.append(f"Request error: {str(e)[:100]}")

        return result

    def _parse_sitemap(self, content: bytes, result: SitemapResult) -> None:
        """Parse sitemap XML content and populate result."""
        try:
            # Try to parse as XML
            root = ET.fromstring(content)
            result.is_valid_xml = True

            # Check namespace (handle both with and without namespace)
            ns = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

            # Check if it's a sitemap index
            sitemap_tags = root.findall('.//sm:sitemap', ns) or root.findall('.//sitemap')
            if sitemap_tags:
                result.is_sitemap_index = True
                result.sitemap_count = len(sitemap_tags)

                # Get sample sitemap URLs. Compare find() results against None
                # explicitly: an ElementTree Element with no children is falsy,
                # so `find(...) or find(...)` would wrongly skip matched <loc> tags.
                for sitemap_tag in sitemap_tags[:5]:
                    loc = sitemap_tag.find('sm:loc', ns)
                    if loc is None:
                        loc = sitemap_tag.find('loc')
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)
            else:
                # Regular sitemap
                url_tags = root.findall('.//sm:url', ns) or root.findall('.//url')
                result.url_count = len(url_tags)

                # Get sample URLs
                for url_tag in url_tags[:10]:
                    loc = url_tag.find('sm:loc', ns)
                    if loc is None:
                        loc = url_tag.find('loc')
                    if loc is not None and loc.text:
                        result.sample_urls.append(loc.text)

        except ET.ParseError as e:
            result.is_valid_xml = False
            result.errors.append(f"Invalid XML: {str(e)[:100]}")
        except Exception as e:
            result.errors.append(f"Error parsing sitemap: {str(e)[:100]}")
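    # Illustrative example (editor's note, not part of the commit): for
    #
    #     <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    #       <url><loc>https://example.com/</loc></url>
    #       <url><loc>https://example.com/about</loc></url>
    #     </urlset>
    #
    # _parse_sitemap sets is_valid_xml=True, is_sitemap_index=False,
    # url_count=2, and copies both <loc> values into sample_urls.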

    def check_redirect_chain(self, url: str) -> RedirectChainResult:
        """
        Check redirect chain for a URL.

        Args:
            url: The URL to check.

        Returns:
            RedirectChainResult with redirect chain analysis.
        """
        result = RedirectChainResult(original_url=url, final_url=url)
        visited_urls = set()
        current_url = url
        start_time = time.time()

        for i in range(MAX_REDIRECTS):
            if current_url in visited_urls:
                result.has_redirect_loop = True
                result.errors.append(f"Redirect loop detected at: {current_url}")
                break

            visited_urls.add(current_url)

            try:
                response = self.session.get(
                    current_url,
                    timeout=self.timeout,
                    allow_redirects=False
                )

                # Check for redirect
                if response.status_code in (301, 302, 303, 307, 308):
                    next_url = response.headers.get('Location')
                    if not next_url:
                        result.errors.append("Redirect without Location header")
                        break

                    # Handle relative redirects
                    if not next_url.startswith(('http://', 'https://')):
                        parsed = urlparse(current_url)
                        if next_url.startswith('/'):
                            next_url = f"{parsed.scheme}://{parsed.netloc}{next_url}"
                        else:
                            next_url = urljoin(current_url, next_url)

                    # Create redirect info
                    parsed_from = urlparse(current_url)
                    parsed_to = urlparse(next_url)

                    redirect_info = RedirectInfo(
                        from_url=current_url,
                        to_url=next_url,
                        status_code=response.status_code,
                        is_https_upgrade=(
                            parsed_from.scheme == 'http' and
                            parsed_to.scheme == 'https' and
                            parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '')
                        ),
                        is_www_redirect=(
                            parsed_from.netloc.replace('www.', '') == parsed_to.netloc.replace('www.', '') and
                            parsed_from.netloc != parsed_to.netloc
                        )
                    )
                    result.redirects.append(redirect_info)

                    # Check for mixed content: an https hop followed by a later http hop
                    if len(result.redirects) >= 2:
                        schemes = [urlparse(r.from_url).scheme for r in result.redirects]
                        schemes.append(parsed_to.scheme)
                        if 'http' in schemes and 'https' in schemes:
                            if schemes.index('https') < len(schemes) - 1 and 'http' in schemes[schemes.index('https'):]:
                                result.has_mixed_content = True

                    current_url = next_url

                else:
                    # No more redirects
                    result.final_url = current_url
                    break

            except requests.exceptions.Timeout:
                result.errors.append(f"Timeout at: {current_url}")
                break
            except requests.exceptions.ConnectionError as e:
                result.errors.append(f"Connection error at {current_url}: {str(e)[:50]}")
                break
            except requests.exceptions.RequestException as e:
                result.errors.append(f"Request error: {str(e)[:100]}")
                break

        result.chain_length = len(result.redirects)
        result.total_time_ms = int((time.time() - start_time) * 1000)

        return result
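    # Illustrative example (editor's note, not part of the commit):
    # requesting http://example.com might produce a single hop,
    #
    #     http://example.com --301--> https://example.com/
    #
    # giving chain_length=1, final_url='https://example.com/', and one
    # RedirectInfo with is_https_upgrade=True and is_www_redirect=False.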

    def _check_canonical(self, html: str, current_url: str) -> CanonicalResult:
        """
        Check canonical URL configuration from HTML.

        Args:
            html: HTML content of the page.
            current_url: Current URL of the page.

        Returns:
            CanonicalResult with canonical URL analysis.
        """
        result = CanonicalResult()

        try:
            soup = BeautifulSoup(html, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(html, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result

        # Find canonical link
        canonical_tag = soup.find('link', rel='canonical')

        if canonical_tag:
            result.has_canonical = True
            canonical_url = canonical_tag.get('href', '')
            result.canonical_url = canonical_url

            if canonical_url:
                # Check if relative
                result.is_relative = not canonical_url.startswith(('http://', 'https://'))

                # Parse canonical URL
                if result.is_relative:
                    # Make it absolute for comparison
                    parsed_current = urlparse(current_url)
                    if canonical_url.startswith('/'):
                        canonical_abs = f"{parsed_current.scheme}://{parsed_current.netloc}{canonical_url}"
                    else:
                        canonical_abs = urljoin(current_url, canonical_url)
                else:
                    canonical_abs = canonical_url

                parsed_canonical = urlparse(canonical_abs)
                parsed_current = urlparse(current_url)

                # Check if valid URL
                result.is_valid_url = bool(parsed_canonical.scheme and parsed_canonical.netloc)

                # Check if self-referencing
                result.is_self_referencing = (
                    parsed_canonical.netloc.replace('www.', '') == parsed_current.netloc.replace('www.', '') and
                    parsed_canonical.path == parsed_current.path
                )

                # Check if points to different domain
                result.points_to_different_domain = (
                    parsed_canonical.netloc.replace('www.', '') != parsed_current.netloc.replace('www.', '')
                )

                # Check if matches current URL exactly
                result.matches_current_url = (canonical_abs.rstrip('/') == current_url.rstrip('/'))

        return result
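    # Illustrative example (editor's note, not part of the commit): on the
    # page https://example.com/blog/post, the tag
    #
    #     <link rel="canonical" href="/blog/post">
    #
    # resolves to https://example.com/blog/post, so is_relative=True,
    # is_self_referencing=True, matches_current_url=True, and
    # points_to_different_domain=False.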

    def _check_indexability(self, response: requests.Response) -> IndexabilityResult:
        """
        Check if page is indexable based on meta tags and HTTP headers.

        Args:
            response: Response object from fetching the page.

        Returns:
            IndexabilityResult with indexability analysis.
        """
        result = IndexabilityResult()

        # Check X-Robots-Tag HTTP header
        x_robots = response.headers.get('X-Robots-Tag', '')
        if x_robots:
            result.x_robots_tag = x_robots
            if 'noindex' in x_robots.lower():
                result.has_noindex_header = True
                result.is_indexable = False
                result.noindex_source = 'header'

        # Check meta robots tag in HTML
        try:
            soup = BeautifulSoup(response.text, 'lxml')
        except Exception:
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
            except Exception as e:
                result.errors.append(f"Failed to parse HTML: {str(e)[:100]}")
                return result

        # Find meta robots
        meta_robots = soup.find('meta', attrs={'name': re.compile(r'^robots$', re.I)})
        if meta_robots:
            content = meta_robots.get('content', '')
            result.meta_robots_content = content

            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'

        # Also check googlebot-specific meta
        meta_googlebot = soup.find('meta', attrs={'name': re.compile(r'^googlebot$', re.I)})
        if meta_googlebot:
            content = meta_googlebot.get('content', '')
            if 'noindex' in content.lower():
                result.has_noindex_meta = True
                result.is_indexable = False
                if not result.noindex_source:
                    result.noindex_source = 'meta'

        return result
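    # Illustrative example (editor's note, not part of the commit): a
    # response carrying
    #
    #     X-Robots-Tag: noindex, nofollow
    #
    # yields has_noindex_header=True, is_indexable=False, and
    # noindex_source='header'; noindex_source is set only once, so the
    # header wins over any meta robots tag found afterwards.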


# Convenience function
def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    """
@@ -767,20 +1407,139 @@ def analyze_html(html: str, base_url: str = '') -> Dict[str, Any]:
    return result.to_dict()


def check_technical_seo(url: str) -> Dict[str, Any]:
    """
    Convenience function for technical SEO check.

    Args:
        url: The URL to check.

    Returns:
        Dict with technical SEO analysis results.
    """
    checker = TechnicalSEOChecker()
    result = checker.check_url(url)
    return result.to_dict()


if __name__ == '__main__':
    import sys
    import requests
    import argparse

    if len(sys.argv) < 2:
        print("Usage: python seo_analyzer.py <url>")
        print("Example: python seo_analyzer.py https://pixlab.pl")
        sys.exit(1)
    parser = argparse.ArgumentParser(description='SEO Analyzer for websites')
    parser.add_argument('url', help='URL to analyze')
    parser.add_argument('--technical', '-t', action='store_true',
                        help='Run technical SEO checks (robots.txt, sitemap, redirects)')
    parser.add_argument('--all', '-a', action='store_true',
                        help='Run both on-page and technical SEO analysis')
    parser.add_argument('--json', '-j', action='store_true',
                        help='Output results as JSON')
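    # Example invocations (editor's note, illustrative; flags as defined above):
    #   python seo_analyzer.py https://example.com --technical
    #   python seo_analyzer.py https://example.com --all --json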

    test_url = sys.argv[1]
    args = parser.parse_args()
    test_url = args.url

    print(f"Analyzing: {test_url}")
    print("-" * 60)

    # Run technical SEO checks if requested
    if args.technical or args.all:
        print("\n" + "=" * 60)
        print("TECHNICAL SEO ANALYSIS")
        print("=" * 60)

        checker = TechnicalSEOChecker()
        tech_result = checker.check_url(test_url)

        if args.json:
            print(json.dumps(tech_result.to_dict(), indent=2, default=str))
        else:
            print("\n=== ROBOTS.TXT ===")
            print(f"Exists: {tech_result.robots_txt.exists}")
            print(f"URL: {tech_result.robots_txt.url}")
            print(f"Status code: {tech_result.robots_txt.status_code}")
            if tech_result.robots_txt.exists:
                print(f"Disallow rules: {len(tech_result.robots_txt.disallow_rules)}")
                if tech_result.robots_txt.disallow_rules:
                    print(f"  Sample: {tech_result.robots_txt.disallow_rules[:5]}")
                print(f"Sitemap URLs: {tech_result.robots_txt.sitemap_urls}")
                print(f"Blocks Googlebot: {tech_result.robots_txt.blocks_googlebot}")
                print(f"Blocks all bots: {tech_result.robots_txt.blocks_all_bots}")
                if tech_result.robots_txt.crawl_delay:
                    print(f"Crawl delay: {tech_result.robots_txt.crawl_delay}")
            if tech_result.robots_txt.errors:
                print(f"Errors: {tech_result.robots_txt.errors}")

            print("\n=== SITEMAP ===")
            print(f"Exists: {tech_result.sitemap.exists}")
            print(f"URL: {tech_result.sitemap.url}")
            print(f"Status code: {tech_result.sitemap.status_code}")
            if tech_result.sitemap.exists:
                print(f"Valid XML: {tech_result.sitemap.is_valid_xml}")
                print(f"Is sitemap index: {tech_result.sitemap.is_sitemap_index}")
                if tech_result.sitemap.is_sitemap_index:
                    print(f"Sitemap count: {tech_result.sitemap.sitemap_count}")
                else:
                    print(f"URL count: {tech_result.sitemap.url_count}")
                if tech_result.sitemap.sample_urls:
                    print(f"Sample URLs: {tech_result.sitemap.sample_urls[:3]}")
            if tech_result.sitemap.errors:
                print(f"Errors: {tech_result.sitemap.errors}")

            print("\n=== REDIRECT CHAIN ===")
            print(f"Original URL: {tech_result.redirect_chain.original_url}")
            print(f"Final URL: {tech_result.redirect_chain.final_url}")
            print(f"Chain length: {tech_result.redirect_chain.chain_length}")
            if tech_result.redirect_chain.redirects:
                for i, r in enumerate(tech_result.redirect_chain.redirects[:5]):
                    print(f"  [{i+1}] {r.status_code}: {r.from_url[:50]}... -> {r.to_url[:50]}...")
                    if r.is_https_upgrade:
                        print("      (HTTPS upgrade)")
                    if r.is_www_redirect:
                        print("      (www redirect)")
            print(f"Has redirect loop: {tech_result.redirect_chain.has_redirect_loop}")
            print(f"Has mixed content: {tech_result.redirect_chain.has_mixed_content}")
            print(f"Total time: {tech_result.redirect_chain.total_time_ms}ms")
            if tech_result.redirect_chain.errors:
                print(f"Errors: {tech_result.redirect_chain.errors}")

            print("\n=== CANONICAL ===")
            print(f"Has canonical: {tech_result.canonical.has_canonical}")
            if tech_result.canonical.has_canonical:
                print(f"Canonical URL: {tech_result.canonical.canonical_url}")
                print(f"Is self-referencing: {tech_result.canonical.is_self_referencing}")
                print(f"Points to different domain: {tech_result.canonical.points_to_different_domain}")
                print(f"Is relative: {tech_result.canonical.is_relative}")
                print(f"Is valid URL: {tech_result.canonical.is_valid_url}")
            if tech_result.canonical.errors:
                print(f"Errors: {tech_result.canonical.errors}")

            print("\n=== INDEXABILITY ===")
            print(f"Is indexable: {tech_result.indexability.is_indexable}")
            print(f"Has noindex meta: {tech_result.indexability.has_noindex_meta}")
            print(f"Has noindex header: {tech_result.indexability.has_noindex_header}")
            if tech_result.indexability.noindex_source:
                print(f"Noindex source: {tech_result.indexability.noindex_source}")
            if tech_result.indexability.meta_robots_content:
                print(f"Meta robots: {tech_result.indexability.meta_robots_content}")
            if tech_result.indexability.x_robots_tag:
                print(f"X-Robots-Tag: {tech_result.indexability.x_robots_tag}")
            if tech_result.indexability.errors:
                print(f"Errors: {tech_result.indexability.errors}")

            if tech_result.errors:
                print("\n=== GENERAL ERRORS ===")
                for error in tech_result.errors:
                    print(f"  - {error}")

        # If only technical was requested, exit
        if not args.all:
            sys.exit(0)

    # Run on-page analysis (default behavior)
    print("\n" + "=" * 60)
    print("ON-PAGE SEO ANALYSIS")
    print("=" * 60)

    # Fetch the page
    try:
        headers = {
@@ -797,65 +1556,68 @@ if __name__ == '__main__':
    analyzer = OnPageSEOAnalyzer()
    result = analyzer.analyze_html(html, test_url)

    # Print results
    print("\n=== META TAGS ===")
    print(f"Title: {result.meta_tags.title}")
    print(f"Title length: {result.meta_tags.title_length}")
    print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
    print(f"Description length: {result.meta_tags.description_length}")
    print(f"Canonical: {result.meta_tags.canonical_url}")
    print(f"Robots: {result.meta_tags.robots}")
    print(f"Viewport: {result.meta_tags.viewport}")
    if args.json:
        print(json.dumps(result.to_dict(), indent=2, default=str))
    else:
        # Print results
        print("\n=== META TAGS ===")
        print(f"Title: {result.meta_tags.title}")
        print(f"Title length: {result.meta_tags.title_length}")
        print(f"Description: {result.meta_tags.description[:100] if result.meta_tags.description else 'None'}...")
        print(f"Description length: {result.meta_tags.description_length}")
        print(f"Canonical: {result.meta_tags.canonical_url}")
        print(f"Robots: {result.meta_tags.robots}")
        print(f"Viewport: {result.meta_tags.viewport}")

    print("\n=== OPEN GRAPH ===")
    print(f"OG Title: {result.open_graph.og_title}")
    print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
    print(f"OG Image: {result.open_graph.og_image}")
    print(f"OG Type: {result.open_graph.og_type}")
        print("\n=== OPEN GRAPH ===")
        print(f"OG Title: {result.open_graph.og_title}")
        print(f"OG Description: {result.open_graph.og_description[:100] if result.open_graph.og_description else 'None'}...")
        print(f"OG Image: {result.open_graph.og_image}")
        print(f"OG Type: {result.open_graph.og_type}")

    print("\n=== TWITTER CARD ===")
    print(f"Card Type: {result.twitter_card.card_type}")
    print(f"Title: {result.twitter_card.title}")
        print("\n=== TWITTER CARD ===")
        print(f"Card Type: {result.twitter_card.card_type}")
        print(f"Title: {result.twitter_card.title}")

    print("\n=== HEADINGS ===")
    print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
    print(f"H2: {result.headings.h2_count}")
    print(f"H3: {result.headings.h3_count}")
    print(f"H4: {result.headings.h4_count}")
    print(f"H5: {result.headings.h5_count}")
    print(f"H6: {result.headings.h6_count}")
    print(f"Has single H1: {result.headings.has_single_h1}")
    print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
    if result.headings.hierarchy_issues:
        print(f"Hierarchy issues: {result.headings.hierarchy_issues}")
        print("\n=== HEADINGS ===")
        print(f"H1: {result.headings.h1_count} ({result.headings.h1_texts})")
        print(f"H2: {result.headings.h2_count}")
        print(f"H3: {result.headings.h3_count}")
        print(f"H4: {result.headings.h4_count}")
        print(f"H5: {result.headings.h5_count}")
        print(f"H6: {result.headings.h6_count}")
        print(f"Has single H1: {result.headings.has_single_h1}")
        print(f"Has proper hierarchy: {result.headings.has_proper_hierarchy}")
        if result.headings.hierarchy_issues:
            print(f"Hierarchy issues: {result.headings.hierarchy_issues}")

    print("\n=== IMAGES ===")
    print(f"Total images: {result.images.total_images}")
    print(f"With alt: {result.images.images_with_alt}")
    print(f"Without alt: {result.images.images_without_alt}")
    print(f"With empty alt: {result.images.images_with_empty_alt}")
    if result.images.alt_text_quality_issues:
        print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")
        print("\n=== IMAGES ===")
        print(f"Total images: {result.images.total_images}")
        print(f"With alt: {result.images.images_with_alt}")
        print(f"Without alt: {result.images.images_without_alt}")
        print(f"With empty alt: {result.images.images_with_empty_alt}")
        if result.images.alt_text_quality_issues:
            print(f"Alt quality issues: {len(result.images.alt_text_quality_issues)}")

    print("\n=== LINKS ===")
    print(f"Total links: {result.links.total_links}")
    print(f"Internal: {result.links.internal_links}")
    print(f"External: {result.links.external_links}")
    print(f"Nofollow: {result.links.nofollow_links}")
    print(f"Broken anchor links: {result.links.broken_anchor_links}")
    print(f"External domains: {result.links.unique_external_domains[:5]}")
        print("\n=== LINKS ===")
        print(f"Total links: {result.links.total_links}")
        print(f"Internal: {result.links.internal_links}")
        print(f"External: {result.links.external_links}")
        print(f"Nofollow: {result.links.nofollow_links}")
        print(f"Broken anchor links: {result.links.broken_anchor_links}")
        print(f"External domains: {result.links.unique_external_domains[:5]}")

    print("\n=== STRUCTURED DATA ===")
    print(f"Has structured data: {result.structured_data.has_structured_data}")
    print(f"JSON-LD count: {result.structured_data.json_ld_count}")
    print(f"Microdata count: {result.structured_data.microdata_count}")
    print(f"RDFa count: {result.structured_data.rdfa_count}")
    print(f"Schema types: {result.structured_data.all_types}")
        print("\n=== STRUCTURED DATA ===")
        print(f"Has structured data: {result.structured_data.has_structured_data}")
        print(f"JSON-LD count: {result.structured_data.json_ld_count}")
        print(f"Microdata count: {result.structured_data.microdata_count}")
        print(f"RDFa count: {result.structured_data.rdfa_count}")
        print(f"Schema types: {result.structured_data.all_types}")

    print("\n=== OTHER ===")
    print(f"Word count: {result.word_count}")
    print(f"Has DOCTYPE: {result.has_doctype}")
    print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")
        print("\n=== OTHER ===")
        print(f"Word count: {result.word_count}")
        print(f"Has DOCTYPE: {result.has_doctype}")
        print(f"Has lang attribute: {result.has_lang_attribute} ({result.lang_attribute})")

    if result.errors:
        print(f"\nErrors: {result.errors}")
        if result.errors:
            print(f"\nErrors: {result.errors}")