feat(audit): Phase 1 - YouTube API, CrUX field data, security headers, image formats
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

New services:
- youtube_service.py: YouTube Data API v3 integration for channel stats
  (subscriber count, view count, video count)
- crux_service.py: Chrome UX Report API for real user field data
  (INP, LCP, CLS, FCP, TTFB from actual Chrome users)

SEO audit enrichment:
- Security headers check: HSTS, CSP, X-Frame-Options, X-Content-Type-Options
  via live requests.head() during data collection
- Image format analysis: WebP/AVIF/SVG vs legacy JPEG/PNG ratio
- CrUX field data complements existing PageSpeed lab data in AI prompt
- All new metrics passed to Gemini for richer analysis

Social media audit enrichment:
- YouTube API data (video count, views, subscribers) integrated into
  social media AI prompt when YouTube profile exists

All APIs use existing GOOGLE_PLACES_API_KEY (free tier, $0 cost).
Completeness: ~68% → ~78% (estimated)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-02-08 11:32:03 +01:00
parent b1438dd514
commit ce6aa53c78
4 changed files with 394 additions and 11 deletions

View File

@ -19,6 +19,8 @@ Created: 2026-02-07
import hashlib import hashlib
import json import json
import logging import logging
import re
import requests
from datetime import datetime, timedelta from datetime import datetime, timedelta
from html import unescape from html import unescape
@ -26,6 +28,8 @@ from database import (
SessionLocal, Company, CompanyWebsiteAnalysis, CompanySocialMedia, SessionLocal, Company, CompanyWebsiteAnalysis, CompanySocialMedia,
CompanyCitation, AuditAction, AuditAICache CompanyCitation, AuditAction, AuditAICache
) )
from youtube_service import YouTubeService
from crux_service import CrUXService
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -97,6 +101,60 @@ def _collect_seo_data(db, company) -> dict:
CompanyCitation.company_id == company.id CompanyCitation.company_id == company.id
).all() ).all()
# Security headers check (live request)
security_headers = {}
if company.website:
try:
resp = requests.head(company.website, timeout=5, allow_redirects=True)
headers = resp.headers
security_headers = {
'has_hsts': 'strict-transport-security' in headers,
'has_csp': 'content-security-policy' in headers,
'has_x_frame_options': 'x-frame-options' in headers,
'has_x_content_type_options': 'x-content-type-options' in headers,
'security_headers_count': sum([
'strict-transport-security' in headers,
'content-security-policy' in headers,
'x-frame-options' in headers,
'x-content-type-options' in headers,
]),
}
except Exception:
pass
# Image format analysis (from existing data if available)
image_formats = {}
if company.website:
try:
resp = requests.get(company.website, timeout=10, allow_redirects=True)
if resp.status_code == 200:
img_srcs = re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', resp.text, re.IGNORECASE)
webp_count = sum(1 for s in img_srcs if '.webp' in s.lower())
avif_count = sum(1 for s in img_srcs if '.avif' in s.lower())
svg_count = sum(1 for s in img_srcs if '.svg' in s.lower())
modern_count = webp_count + avif_count + svg_count
legacy_count = len(img_srcs) - modern_count
image_formats = {
'total_images_found': len(img_srcs),
'webp_count': webp_count,
'avif_count': avif_count,
'svg_count': svg_count,
'modern_format_count': modern_count,
'legacy_format_count': legacy_count,
'modern_format_ratio': round(modern_count / len(img_srcs) * 100, 1) if img_srcs else 0,
}
except Exception:
pass
# CrUX field data (real user metrics from Chrome)
crux_data = {}
if company.website:
try:
crux = CrUXService()
crux_data = crux.get_field_data(company.website) or {}
except Exception as e:
logger.warning(f"CrUX error for {company.website}: {e}")
return { return {
'company_name': company.name, 'company_name': company.name,
'company_category': company.category.name if company.category else None, 'company_category': company.category.name if company.category else None,
@ -161,6 +219,25 @@ def _collect_seo_data(db, company) -> dict:
# Citations # Citations
'citations_count': len(citations), 'citations_count': len(citations),
'citations_found': len([c for c in citations if c.status == 'found']), 'citations_found': len([c for c in citations if c.status == 'found']),
# Security headers
'has_hsts': security_headers.get('has_hsts', None),
'has_csp': security_headers.get('has_csp', None),
'has_x_frame_options': security_headers.get('has_x_frame_options', None),
'has_x_content_type_options': security_headers.get('has_x_content_type_options', None),
'security_headers_count': security_headers.get('security_headers_count', None),
# Image formats
'modern_format_ratio': image_formats.get('modern_format_ratio', None),
'webp_count': image_formats.get('webp_count', None),
'legacy_image_count': image_formats.get('legacy_format_count', None),
# CrUX field data (real user metrics)
'crux_lcp_ms': crux_data.get('crux_lcp_ms'),
'crux_inp_ms': crux_data.get('crux_inp_ms'),
'crux_cls': crux_data.get('crux_cls'),
'crux_fcp_ms': crux_data.get('crux_fcp_ms'),
'crux_ttfb_ms': crux_data.get('crux_ttfb_ms'),
'crux_lcp_good_pct': crux_data.get('crux_lcp_ms_good_pct'),
'crux_inp_good_pct': crux_data.get('crux_inp_ms_good_pct'),
'crux_period_end': crux_data.get('crux_period_end'),
} }
@ -256,6 +333,21 @@ def _collect_social_data(db, company) -> dict:
present = [p for p in all_platforms if p in profiles_dict] present = [p for p in all_platforms if p in profiles_dict]
missing = [p for p in all_platforms if p not in profiles_dict] missing = [p for p in all_platforms if p not in profiles_dict]
# Fetch YouTube API data if profile exists
youtube_data = None
if 'youtube' in profiles_dict and profiles_dict['youtube'].get('url'):
try:
yt_service = YouTubeService()
channel_id = yt_service.extract_channel_id_from_url(profiles_dict['youtube']['url'])
if channel_id:
youtube_data = yt_service.get_channel_stats(channel_id)
if youtube_data:
profiles_dict['youtube']['subscriber_count'] = youtube_data.get('subscriber_count')
profiles_dict['youtube']['view_count'] = youtube_data.get('view_count')
profiles_dict['youtube']['video_count'] = youtube_data.get('video_count')
except Exception as e:
logger.warning(f"YouTube API error: {e}")
return { return {
'company_name': company.name, 'company_name': company.name,
'company_category': company.category.name if company.category else None, 'company_category': company.category.name if company.category else None,
@ -289,12 +381,21 @@ WYNIKI AUDYTU SEO:
- Dostępność: {data.get('accessibility_score', 'brak')}/100 - Dostępność: {data.get('accessibility_score', 'brak')}/100
- Best Practices: {data.get('best_practices_score', 'brak')}/100 - Best Practices: {data.get('best_practices_score', 'brak')}/100
Core Web Vitals: Core Web Vitals (lab data z PageSpeed):
- LCP: {data.get('lcp_ms', 'brak')} ms - LCP: {data.get('lcp_ms', 'brak')} ms
- INP: {data.get('inp_ms', 'brak')} ms (zastąpił FID w marcu 2024) - INP: {data.get('inp_ms', 'brak')} ms (zastąpił FID w marcu 2024)
- CLS: {data.get('cls', 'brak')} - CLS: {data.get('cls', 'brak')}
Dodatkowe metryki wydajności: CrUX Field Data (dane od realnych użytkowników Chrome):
- LCP (field): {data.get('crux_lcp_ms', 'brak danych')} ms ({data.get('crux_lcp_good_pct', '?')}% dobrych)
- INP (field): {data.get('crux_inp_ms', 'brak danych')} ms ({data.get('crux_inp_good_pct', '?')}% dobrych)
- CLS (field): {data.get('crux_cls', 'brak danych')}
- FCP (field): {data.get('crux_fcp_ms', 'brak danych')} ms
- TTFB (field): {data.get('crux_ttfb_ms', 'brak danych')} ms
- Okres pomiarowy: do {data.get('crux_period_end', 'brak')}
UWAGA: "brak danych" oznacza, że strona nie ma wystarczającego ruchu z Chrome do raportowania CrUX.
Dodatkowe metryki wydajności (lab data):
- FCP: {data.get('fcp_ms', 'brak')} ms - FCP: {data.get('fcp_ms', 'brak')} ms
- TTFB: {data.get('ttfb_ms', 'brak')} ms - TTFB: {data.get('ttfb_ms', 'brak')} ms
- TBT: {data.get('tbt_ms', 'brak')} ms - TBT: {data.get('tbt_ms', 'brak')} ms
@ -317,6 +418,13 @@ Technical SEO:
- Indeksowalna: {'tak' if data.get('is_indexable') else 'NIE'} - Indeksowalna: {'tak' if data.get('is_indexable') else 'NIE'}
- Mobile-friendly: {'tak' if data.get('is_mobile_friendly') else 'NIE/brak danych'} - Mobile-friendly: {'tak' if data.get('is_mobile_friendly') else 'NIE/brak danych'}
Security Headers:
- HSTS: {'tak' if data.get('has_hsts') else 'NIE' if data.get('has_hsts') is not None else 'brak danych'}
- CSP: {'tak' if data.get('has_csp') else 'NIE' if data.get('has_csp') is not None else 'brak danych'}
- X-Frame-Options: {'tak' if data.get('has_x_frame_options') else 'NIE' if data.get('has_x_frame_options') is not None else 'brak danych'}
- X-Content-Type-Options: {'tak' if data.get('has_x_content_type_options') else 'NIE' if data.get('has_x_content_type_options') is not None else 'brak danych'}
- Nagłówki bezpieczeństwa: {data.get('security_headers_count', '?')}/4
Dane strukturalne: Dane strukturalne:
- Schema.org: {'tak' if data.get('has_structured_data') else 'NIE'} (typy: {data.get('structured_data_types', [])}) - Schema.org: {'tak' if data.get('has_structured_data') else 'NIE'} (typy: {data.get('structured_data_types', [])})
- LocalBusiness Schema: {'tak' if data.get('has_local_business_schema') else 'NIE'} - LocalBusiness Schema: {'tak' if data.get('has_local_business_schema') else 'NIE'}
@ -339,6 +447,10 @@ Treść:
- Świeżość: {data.get('content_freshness_score', 'brak')}/100 - Świeżość: {data.get('content_freshness_score', 'brak')}/100
- Słów na stronie głównej: {data.get('word_count_homepage', 'brak')} - Słów na stronie głównej: {data.get('word_count_homepage', 'brak')}
Formaty obrazów:
- Nowoczesne (WebP/AVIF/SVG): {data.get('modern_format_ratio', '?')}% ({data.get('webp_count', 0)} WebP)
- Legacy (JPEG/PNG): {data.get('legacy_image_count', '?')} obrazów
ZADANIE: ZADANIE:
Przygotuj analizę w formacie JSON z dwoma kluczami: Przygotuj analizę w formacie JSON z dwoma kluczami:
@ -346,7 +458,7 @@ Przygotuj analizę w formacie JSON z dwoma kluczami:
2. "actions" - lista od 3 do 8 priorytetowanych akcji do podjęcia. Każda akcja to obiekt: 2. "actions" - lista od 3 do 8 priorytetowanych akcji do podjęcia. Każda akcja to obiekt:
{{ {{
"action_type": "typ akcji z listy: generate_schema_org, generate_meta_description, suggest_heading_fix, generate_alt_texts, seo_roadmap, add_analytics, add_sitemap, fix_ssl, add_og_tags, improve_performance, add_local_keywords, add_nap, fix_broken_links", "action_type": "typ akcji z listy: generate_schema_org, generate_meta_description, suggest_heading_fix, generate_alt_texts, seo_roadmap, add_analytics, add_sitemap, fix_ssl, add_og_tags, improve_performance, add_local_keywords, add_nap, fix_broken_links, improve_security_headers, optimize_images",
"title": "krótki tytuł po polsku", "title": "krótki tytuł po polsku",
"description": "opis co trzeba zrobić i dlaczego, 1-2 zdania", "description": "opis co trzeba zrobić i dlaczego, 1-2 zdania",
"priority": "critical/high/medium/low", "priority": "critical/high/medium/low",
@ -453,6 +565,12 @@ def _build_social_prompt(data: dict) -> str:
if info.get('last_post_date'): if info.get('last_post_date'):
profiles_info += f", ost.post={info.get('last_post_date')}" profiles_info += f", ost.post={info.get('last_post_date')}"
# YouTube metrics from API
if platform == 'youtube' and info.get('video_count') is not None:
profiles_info += f", filmy={info.get('video_count')}"
profiles_info += f", wyświetlenia={info.get('view_count', '?')}"
profiles_info += f", subskrybenci={info.get('subscriber_count', '?')}"
# Collect engagement rates for average calculation # Collect engagement rates for average calculation
if info.get('engagement_rate'): if info.get('engagement_rate'):
engagement_rates.append(info.get('engagement_rate')) engagement_rates.append(info.get('engagement_rate'))

104
crux_service.py Normal file
View File

@ -0,0 +1,104 @@
"""Chrome UX Report (CrUX) API Service.
Pobiera field data (dane od realnych użytkowników Chrome) dla stron internetowych.
Uzupełnia lab data z PageSpeed Insights o metryki z rzeczywistego ruchu.
API: https://chromeuxreport.googleapis.com/v1/records:queryRecord
Free tier: 150 requests/minute
"""
import os
import logging
import requests
logger = logging.getLogger(__name__)
class CrUXService:
"""Service for Chrome UX Report API."""
BASE_URL = 'https://chromeuxreport.googleapis.com/v1/records:queryRecord'
def __init__(self, api_key: str = None):
self.api_key = api_key or os.environ.get('GOOGLE_PLACES_API_KEY')
if not self.api_key:
logger.warning("CrUX API key not configured (GOOGLE_PLACES_API_KEY)")
def get_field_data(self, url: str) -> dict | None:
"""Fetch CrUX field data for a URL.
Args:
url: Website URL (e.g., 'https://example.com')
Returns:
Dict with field metrics or None if no data available.
Many small/local business sites won't have CrUX data.
"""
if not self.api_key:
return None
try:
# Try origin-level first (more likely to have data)
response = requests.post(
f"{self.BASE_URL}?key={self.api_key}",
json={'origin': url.rstrip('/')},
timeout=10
)
if response.status_code == 404:
# No CrUX data for this origin (common for small sites)
return None
if response.status_code != 200:
logger.warning(f"CrUX API error {response.status_code} for {url}")
return None
data = response.json()
record = data.get('record', {})
metrics = record.get('metrics', {})
result = {}
# Extract each metric's p75 value
metric_mapping = {
'largest_contentful_paint': 'crux_lcp_ms',
'interaction_to_next_paint': 'crux_inp_ms',
'cumulative_layout_shift': 'crux_cls',
'first_contentful_paint': 'crux_fcp_ms',
'time_to_first_byte': 'crux_ttfb_ms',
}
for api_name, our_name in metric_mapping.items():
metric = metrics.get(api_name, {})
percentiles = metric.get('percentiles', {})
p75 = percentiles.get('p75')
if p75 is not None:
# CLS is reported as decimal (e.g., 0.15), others in ms
if 'layout_shift' in api_name:
result[our_name] = round(float(p75), 3)
else:
result[our_name] = int(p75)
# Also extract histogram category distribution
histogram = metric.get('histogram', [])
if histogram and len(histogram) >= 3:
total = sum(h.get('density', 0) for h in histogram)
if total > 0:
good_pct = round(histogram[0].get('density', 0) * 100, 1)
result[f'{our_name}_good_pct'] = good_pct
# Collection period
collection_period = record.get('collectionPeriod', {})
if collection_period:
last_date = collection_period.get('lastDate', {})
if last_date:
result['crux_period_end'] = f"{last_date.get('year')}-{last_date.get('month'):02d}-{last_date.get('day'):02d}"
return result if result else None
except requests.exceptions.Timeout:
logger.warning(f"CrUX API timeout for {url}")
return None
except Exception as e:
logger.warning(f"CrUX API error for {url}: {e}")
return None

View File

@ -6,14 +6,15 @@
## Stan Implementacji ## Stan Implementacji
### Faza 0: Quick Wins (1-3 dni, $0) — W TRAKCIE ### Faza 0: Quick Wins (1-3 dni, $0) — UKOŃCZONA (2026-02-08)
- [ ] **GBP bugfix:** review_response_rate sprawdza `authorAttribution.displayName` zamiast `ownerResponse` → zawsze fałszywe dane (gbp_audit_service.py) - [x] **GBP bugfix:** review_response_rate — naprawiono: sprawdza `ownerResponse` zamiast `authorAttribution.displayName`
- [ ] **GBP phantom fields:** has_posts, has_products, has_qa nigdy nie wypełniane → oznaczyć jako "niedostępne bez OAuth" w _build_gbp_prompt() - [x] **GBP phantom fields:** has_posts, has_products, has_qa oznaczone jako `[dane niedostępne bez autoryzacji OAuth]`
- [ ] **SEO: FID→INP:** FID deprecated marzec 2024, INP nie zbierany. Dostępny w `loadingExperience.metrics.INTERACTION_TO_NEXT_PAINT` z PageSpeed API - [x] **GBP prompt:** dodano review_keywords i description_keywords do promptu AI
- [ ] **SEO: 10 metryk do promptu:** FCP, TTFB, TBT, Speed Index, load_time_ms, meta title/desc length, schema details, html lang — JUŻ W DB ale nie w prompcie AI - [x] **SEO: FID→INP:** zastąpiono FID przez INP w prompcie i szablonach (progi: 200ms/500ms)
- [ ] **Social: engagement_rate** — pole w DB istnieje, nigdy nie obliczane. Formuła: estimated base_rate × activity_multiplier - [x] **SEO: 10 metryk do promptu:** FCP, TTFB, TBT, Speed Index, meta title/desc length, schema details, html lang
- [ ] **Social: posting_frequency_score** — pole w DB, nigdy nie obliczane. 0-10 based on posts_count_30d - [x] **Social: engagement_rate** — obliczane z industry base_rate × activity_multiplier
- [ ] **Social: enrichment promptu** — dodać last_post_date, page_name, engagement metrics - [x] **Social: posting_frequency_score** — 0-10 based on posts_count_30d
- [x] **Social: enrichment promptu** — last_post_date, page_name, engagement metrics, brand consistency
**Agenci Phase 0 (team: phase0-quickwins):** **Agenci Phase 0 (team: phase0-quickwins):**
- gbp-fixer: Fix review_response_rate + GBP prompt enrichment - gbp-fixer: Fix review_response_rate + GBP prompt enrichment

160
youtube_service.py Normal file
View File

@ -0,0 +1,160 @@
"""
YouTube Data API v3 Service for NordaBiz
=========================================
Simple YouTube API client for fetching channel statistics.
Uses the YouTube Data API v3 with the same Google API key as Places API.
API Reference: https://developers.google.com/youtube/v3/docs/channels
Author: NordaBiz Development Team
Created: 2026-02-08
"""
import os
import re
import logging
from typing import Optional, Dict
import requests
logger = logging.getLogger(__name__)
# API Configuration
YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3"
class YouTubeService:
    """Fetches YouTube channel statistics via YouTube Data API v3.

    Uses the same Google API key as the Places API.
    API Reference: https://developers.google.com/youtube/v3/docs/channels
    """

    def __init__(self, api_key: str = None):
        """Create the service.

        Args:
            api_key: Google API key; falls back to GOOGLE_PLACES_API_KEY.

        Raises:
            ValueError: If no API key is configured.
        """
        self.api_key = api_key or os.getenv('GOOGLE_PLACES_API_KEY')
        if not self.api_key:
            raise ValueError("GOOGLE_PLACES_API_KEY not set in environment")
        # One session reused across calls for connection pooling.
        self.session = requests.Session()

    def extract_channel_id_from_url(self, url: str) -> Optional[str]:
        """Extract a channel ID or handle from a YouTube channel URL.

        Supported formats:
            - youtube.com/channel/UC1234567890abcdef
            - youtube.com/@handle
            - youtube.com/c/channelname
            - youtube.com/user/username

        Args:
            url: YouTube channel URL.

        Returns:
            Channel ID (starts with UC), or handle/username (without '@'),
            or None when nothing recognizable is found.
        """
        if not url:
            return None

        # Direct channel ID (UC...).
        match = re.search(r'youtube\.com/channel/([A-Za-z0-9_-]+)', url)
        if match:
            return match.group(1)

        # Handle (@username). Handles may contain periods in addition to
        # letters, digits, underscores and hyphens.
        match = re.search(r'youtube\.com/@([A-Za-z0-9._-]+)', url)
        if match:
            return match.group(1)  # Return without the leading '@'

        # Legacy /c/ and /user/ formats; custom names may include periods.
        match = re.search(r'youtube\.com/(?:c|user)/([A-Za-z0-9._-]+)', url)
        if match:
            return match.group(1)

        logger.warning(f"Unable to extract channel ID from URL: {url}")
        return None

    def get_channel_stats(self, channel_id_or_username: str) -> Optional[Dict]:
        """Fetch channel statistics from YouTube Data API v3.

        Args:
            channel_id_or_username: Channel ID (UC...) or username/handle.

        Returns:
            Dict with channel stats or None on error:
            {
                'subscriber_count': int,
                'view_count': int,
                'video_count': int,
                'channel_title': str,
                'channel_description': str
            }
        """
        if not channel_id_or_username:
            return None

        url = f"{YOUTUBE_API_BASE}/channels"

        # A channel ID (UC...) uses the 'id' filter; anything else is
        # looked up as a modern handle first.
        if channel_id_or_username.startswith('UC'):
            lookup = {'id': channel_id_or_username}
        else:
            lookup = {'forHandle': channel_id_or_username}

        try:
            items = self._query_channels(url, lookup)

            # Fall back to the legacy username lookup for old accounts.
            if not items and not channel_id_or_username.startswith('UC'):
                items = self._query_channels(
                    url, {'forUsername': channel_id_or_username}
                )

            if not items:
                logger.warning(f"No YouTube channel found for: {channel_id_or_username}")
                return None

            channel = items[0]
            stats = channel.get('statistics', {})
            snippet = channel.get('snippet', {})

            # Missing counts (e.g. hidden subscriber counts) default to 0.
            result = {
                'subscriber_count': int(stats.get('subscriberCount', 0)),
                'view_count': int(stats.get('viewCount', 0)),
                'video_count': int(stats.get('videoCount', 0)),
                'channel_title': snippet.get('title', ''),
                'channel_description': snippet.get('description', '')
            }
            logger.info(f"Fetched YouTube stats for {result['channel_title']}: "
                        f"{result['subscriber_count']} subscribers, "
                        f"{result['video_count']} videos")
            return result

        except requests.exceptions.HTTPError as e:
            logger.warning(f"YouTube API HTTP error for {channel_id_or_username}: "
                           f"{e.response.status_code} - {e.response.text}")
            return None
        except requests.exceptions.RequestException as e:
            logger.warning(f"YouTube API request error for {channel_id_or_username}: {e}")
            return None
        except (KeyError, ValueError, TypeError) as e:
            logger.warning(f"YouTube API response parse error: {e}")
            return None

    def _query_channels(self, url: str, lookup: Dict) -> list:
        """Run one channels.list request; return the 'items' list (may be empty).

        Raises requests exceptions on transport/HTTP errors (handled by caller).
        """
        params = {'part': 'statistics,snippet', 'key': self.api_key, **lookup}
        response = self.session.get(url, params=params, timeout=15)
        response.raise_for_status()
        return response.json().get('items', [])