From ce6aa53c7895357f3d7e19f1835174da6ebad8dd Mon Sep 17 00:00:00 2001 From: Maciej Pienczyn Date: Sun, 8 Feb 2026 11:32:03 +0100 Subject: [PATCH] feat(audit): Phase 1 - YouTube API, CrUX field data, security headers, image formats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New services: - youtube_service.py: YouTube Data API v3 integration for channel stats (subscriber count, view count, video count) - crux_service.py: Chrome UX Report API for real user field data (INP, LCP, CLS, FCP, TTFB from actual Chrome users) SEO audit enrichment: - Security headers check: HSTS, CSP, X-Frame-Options, X-Content-Type-Options via live requests.head() during data collection - Image format analysis: WebP/AVIF/SVG vs legacy JPEG/PNG ratio - CrUX field data complements existing PageSpeed lab data in AI prompt - All new metrics passed to Gemini for richer analysis Social media audit enrichment: - YouTube API data (video count, views, subscribers) integrated into social media AI prompt when YouTube profile exists All APIs use existing GOOGLE_PLACES_API_KEY (free tier, $0 cost). 
Completeness: ~68% → ~78% (estimated) Co-Authored-By: Claude Opus 4.6 --- audit_ai_service.py | 124 ++++++++++++++++++++++++- crux_service.py | 104 +++++++++++++++++++++ docs/AUDIT_COMPLETENESS_PLAN.md | 17 ++-- youtube_service.py | 160 ++++++++++++++++++++++++++++++++ 4 files changed, 394 insertions(+), 11 deletions(-) create mode 100644 crux_service.py create mode 100644 youtube_service.py diff --git a/audit_ai_service.py b/audit_ai_service.py index 263dd15..63dbfe1 100644 --- a/audit_ai_service.py +++ b/audit_ai_service.py @@ -19,6 +19,8 @@ Created: 2026-02-07 import hashlib import json import logging +import re +import requests from datetime import datetime, timedelta from html import unescape @@ -26,6 +28,8 @@ from database import ( SessionLocal, Company, CompanyWebsiteAnalysis, CompanySocialMedia, CompanyCitation, AuditAction, AuditAICache ) +from youtube_service import YouTubeService +from crux_service import CrUXService logger = logging.getLogger(__name__) @@ -97,6 +101,60 @@ def _collect_seo_data(db, company) -> dict: CompanyCitation.company_id == company.id ).all() + # Security headers check (live request) + security_headers = {} + if company.website: + try: + resp = requests.head(company.website, timeout=5, allow_redirects=True) + headers = resp.headers + security_headers = { + 'has_hsts': 'strict-transport-security' in headers, + 'has_csp': 'content-security-policy' in headers, + 'has_x_frame_options': 'x-frame-options' in headers, + 'has_x_content_type_options': 'x-content-type-options' in headers, + 'security_headers_count': sum([ + 'strict-transport-security' in headers, + 'content-security-policy' in headers, + 'x-frame-options' in headers, + 'x-content-type-options' in headers, + ]), + } + except Exception: + pass + + # Image format analysis (from existing data if available) + image_formats = {} + if company.website: + try: + resp = requests.get(company.website, timeout=10, allow_redirects=True) + if resp.status_code == 200: + img_srcs = 
re.findall(r'<img[^>]+src=["\']([^"\']+)["\']', resp.text, re.IGNORECASE) + webp_count = sum(1 for s in img_srcs if '.webp' in s.lower()) + avif_count = sum(1 for s in img_srcs if '.avif' in s.lower()) + svg_count = sum(1 for s in img_srcs if '.svg' in s.lower()) + modern_count = webp_count + avif_count + svg_count + legacy_count = len(img_srcs) - modern_count + image_formats = { + 'total_images_found': len(img_srcs), + 'webp_count': webp_count, + 'avif_count': avif_count, + 'svg_count': svg_count, + 'modern_format_count': modern_count, + 'legacy_format_count': legacy_count, + 'modern_format_ratio': round(modern_count / len(img_srcs) * 100, 1) if img_srcs else 0, + } + except Exception: + pass + + # CrUX field data (real user metrics from Chrome) + crux_data = {} + if company.website: + try: + crux = CrUXService() + crux_data = crux.get_field_data(company.website) or {} + except Exception as e: + logger.warning(f"CrUX error for {company.website}: {e}") + return { 'company_name': company.name, 'company_category': company.category.name if company.category else None, @@ -161,6 +219,25 @@ def _collect_seo_data(db, company) -> dict: # Citations 'citations_count': len(citations), 'citations_found': len([c for c in citations if c.status == 'found']), + # Security headers + 'has_hsts': security_headers.get('has_hsts', None), + 'has_csp': security_headers.get('has_csp', None), + 'has_x_frame_options': security_headers.get('has_x_frame_options', None), + 'has_x_content_type_options': security_headers.get('has_x_content_type_options', None), + 'security_headers_count': security_headers.get('security_headers_count', None), + # Image formats + 'modern_format_ratio': image_formats.get('modern_format_ratio', None), + 'webp_count': image_formats.get('webp_count', None), + 'legacy_image_count': image_formats.get('legacy_format_count', None), + # CrUX field data (real user metrics) + 'crux_lcp_ms': crux_data.get('crux_lcp_ms'), + 'crux_inp_ms': crux_data.get('crux_inp_ms'), + 'crux_cls': 
crux_data.get('crux_cls'), + 'crux_fcp_ms': crux_data.get('crux_fcp_ms'), + 'crux_ttfb_ms': crux_data.get('crux_ttfb_ms'), + 'crux_lcp_good_pct': crux_data.get('crux_lcp_ms_good_pct'), + 'crux_inp_good_pct': crux_data.get('crux_inp_ms_good_pct'), + 'crux_period_end': crux_data.get('crux_period_end'), } @@ -256,6 +333,21 @@ def _collect_social_data(db, company) -> dict: present = [p for p in all_platforms if p in profiles_dict] missing = [p for p in all_platforms if p not in profiles_dict] + # Fetch YouTube API data if profile exists + youtube_data = None + if 'youtube' in profiles_dict and profiles_dict['youtube'].get('url'): + try: + yt_service = YouTubeService() + channel_id = yt_service.extract_channel_id_from_url(profiles_dict['youtube']['url']) + if channel_id: + youtube_data = yt_service.get_channel_stats(channel_id) + if youtube_data: + profiles_dict['youtube']['subscriber_count'] = youtube_data.get('subscriber_count') + profiles_dict['youtube']['view_count'] = youtube_data.get('view_count') + profiles_dict['youtube']['video_count'] = youtube_data.get('video_count') + except Exception as e: + logger.warning(f"YouTube API error: {e}") + return { 'company_name': company.name, 'company_category': company.category.name if company.category else None, @@ -289,12 +381,21 @@ WYNIKI AUDYTU SEO: - Dostępność: {data.get('accessibility_score', 'brak')}/100 - Best Practices: {data.get('best_practices_score', 'brak')}/100 -Core Web Vitals: +Core Web Vitals (lab data z PageSpeed): - LCP: {data.get('lcp_ms', 'brak')} ms - INP: {data.get('inp_ms', 'brak')} ms (zastąpił FID w marcu 2024) - CLS: {data.get('cls', 'brak')} -Dodatkowe metryki wydajności: +CrUX Field Data (dane od realnych użytkowników Chrome): +- LCP (field): {data.get('crux_lcp_ms', 'brak danych')} ms ({data.get('crux_lcp_good_pct', '?')}% dobrych) +- INP (field): {data.get('crux_inp_ms', 'brak danych')} ms ({data.get('crux_inp_good_pct', '?')}% dobrych) +- CLS (field): {data.get('crux_cls', 'brak danych')} +- 
FCP (field): {data.get('crux_fcp_ms', 'brak danych')} ms +- TTFB (field): {data.get('crux_ttfb_ms', 'brak danych')} ms +- Okres pomiarowy: do {data.get('crux_period_end', 'brak')} +UWAGA: "brak danych" oznacza, że strona nie ma wystarczającego ruchu z Chrome do raportowania CrUX. + +Dodatkowe metryki wydajności (lab data): - FCP: {data.get('fcp_ms', 'brak')} ms - TTFB: {data.get('ttfb_ms', 'brak')} ms - TBT: {data.get('tbt_ms', 'brak')} ms @@ -317,6 +418,13 @@ Technical SEO: - Indeksowalna: {'tak' if data.get('is_indexable') else 'NIE'} - Mobile-friendly: {'tak' if data.get('is_mobile_friendly') else 'NIE/brak danych'} +Security Headers: +- HSTS: {'tak' if data.get('has_hsts') else 'NIE' if data.get('has_hsts') is not None else 'brak danych'} +- CSP: {'tak' if data.get('has_csp') else 'NIE' if data.get('has_csp') is not None else 'brak danych'} +- X-Frame-Options: {'tak' if data.get('has_x_frame_options') else 'NIE' if data.get('has_x_frame_options') is not None else 'brak danych'} +- X-Content-Type-Options: {'tak' if data.get('has_x_content_type_options') else 'NIE' if data.get('has_x_content_type_options') is not None else 'brak danych'} +- Nagłówki bezpieczeństwa: {data.get('security_headers_count', '?')}/4 + Dane strukturalne: - Schema.org: {'tak' if data.get('has_structured_data') else 'NIE'} (typy: {data.get('structured_data_types', [])}) - LocalBusiness Schema: {'tak' if data.get('has_local_business_schema') else 'NIE'} @@ -339,6 +447,10 @@ Treść: - Świeżość: {data.get('content_freshness_score', 'brak')}/100 - Słów na stronie głównej: {data.get('word_count_homepage', 'brak')} +Formaty obrazów: +- Nowoczesne (WebP/AVIF/SVG): {data.get('modern_format_ratio', '?')}% ({data.get('webp_count', 0)} WebP) +- Legacy (JPEG/PNG): {data.get('legacy_image_count', '?')} obrazów + ZADANIE: Przygotuj analizę w formacie JSON z dwoma kluczami: @@ -346,7 +458,7 @@ Przygotuj analizę w formacie JSON z dwoma kluczami: 2. 
"actions" - lista od 3 do 8 priorytetowanych akcji do podjęcia. Każda akcja to obiekt: {{ - "action_type": "typ akcji z listy: generate_schema_org, generate_meta_description, suggest_heading_fix, generate_alt_texts, seo_roadmap, add_analytics, add_sitemap, fix_ssl, add_og_tags, improve_performance, add_local_keywords, add_nap, fix_broken_links", + "action_type": "typ akcji z listy: generate_schema_org, generate_meta_description, suggest_heading_fix, generate_alt_texts, seo_roadmap, add_analytics, add_sitemap, fix_ssl, add_og_tags, improve_performance, add_local_keywords, add_nap, fix_broken_links, improve_security_headers, optimize_images", "title": "krótki tytuł po polsku", "description": "opis co trzeba zrobić i dlaczego, 1-2 zdania", "priority": "critical/high/medium/low", @@ -453,6 +565,12 @@ def _build_social_prompt(data: dict) -> str: if info.get('last_post_date'): profiles_info += f", ost.post={info.get('last_post_date')}" + # YouTube metrics from API + if platform == 'youtube' and info.get('video_count') is not None: + profiles_info += f", filmy={info.get('video_count')}" + profiles_info += f", wyświetlenia={info.get('view_count', '?')}" + profiles_info += f", subskrybenci={info.get('subscriber_count', '?')}" + # Collect engagement rates for average calculation if info.get('engagement_rate'): engagement_rates.append(info.get('engagement_rate')) diff --git a/crux_service.py b/crux_service.py new file mode 100644 index 0000000..c4a662d --- /dev/null +++ b/crux_service.py @@ -0,0 +1,104 @@ +"""Chrome UX Report (CrUX) API Service. + +Pobiera field data (dane od realnych użytkowników Chrome) dla stron internetowych. +Uzupełnia lab data z PageSpeed Insights o metryki z rzeczywistego ruchu. 
+ +API: https://chromeuxreport.googleapis.com/v1/records:queryRecord +Free tier: 150 requests/minute +""" + +import os +import logging +import requests + +logger = logging.getLogger(__name__) + + +class CrUXService: + """Service for Chrome UX Report API.""" + + BASE_URL = 'https://chromeuxreport.googleapis.com/v1/records:queryRecord' + + def __init__(self, api_key: str = None): + self.api_key = api_key or os.environ.get('GOOGLE_PLACES_API_KEY') + if not self.api_key: + logger.warning("CrUX API key not configured (GOOGLE_PLACES_API_KEY)") + + def get_field_data(self, url: str) -> dict | None: + """Fetch CrUX field data for a URL. + + Args: + url: Website URL (e.g., 'https://example.com') + + Returns: + Dict with field metrics or None if no data available. + Many small/local business sites won't have CrUX data. + """ + if not self.api_key: + return None + + try: + # Try origin-level first (more likely to have data) + response = requests.post( + f"{self.BASE_URL}?key={self.api_key}", + json={'origin': url.rstrip('/')}, + timeout=10 + ) + + if response.status_code == 404: + # No CrUX data for this origin (common for small sites) + return None + + if response.status_code != 200: + logger.warning(f"CrUX API error {response.status_code} for {url}") + return None + + data = response.json() + record = data.get('record', {}) + metrics = record.get('metrics', {}) + + result = {} + + # Extract each metric's p75 value + metric_mapping = { + 'largest_contentful_paint': 'crux_lcp_ms', + 'interaction_to_next_paint': 'crux_inp_ms', + 'cumulative_layout_shift': 'crux_cls', + 'first_contentful_paint': 'crux_fcp_ms', + 'time_to_first_byte': 'crux_ttfb_ms', + } + + for api_name, our_name in metric_mapping.items(): + metric = metrics.get(api_name, {}) + percentiles = metric.get('percentiles', {}) + p75 = percentiles.get('p75') + if p75 is not None: + # CLS is reported as decimal (e.g., 0.15), others in ms + if 'layout_shift' in api_name: + result[our_name] = round(float(p75), 3) + else: 
+ result[our_name] = int(p75) + + # Also extract histogram category distribution + histogram = metric.get('histogram', []) + if histogram and len(histogram) >= 3: + total = sum(h.get('density', 0) for h in histogram) + if total > 0: + good_pct = round(histogram[0].get('density', 0) * 100, 1) + result[f'{our_name}_good_pct'] = good_pct + + # Collection period + collection_period = record.get('collectionPeriod', {}) + if collection_period: + last_date = collection_period.get('lastDate', {}) + if last_date: + result['crux_period_end'] = f"{last_date.get('year')}-{last_date.get('month'):02d}-{last_date.get('day'):02d}" + + return result if result else None + + except requests.exceptions.Timeout: + logger.warning(f"CrUX API timeout for {url}") + return None + except Exception as e: + logger.warning(f"CrUX API error for {url}: {e}") + return None diff --git a/docs/AUDIT_COMPLETENESS_PLAN.md b/docs/AUDIT_COMPLETENESS_PLAN.md index 0ba1021..35cf776 100644 --- a/docs/AUDIT_COMPLETENESS_PLAN.md +++ b/docs/AUDIT_COMPLETENESS_PLAN.md @@ -6,14 +6,15 @@ ## Stan Implementacji -### Faza 0: Quick Wins (1-3 dni, $0) — W TRAKCIE -- [ ] **GBP bugfix:** review_response_rate sprawdza `authorAttribution.displayName` zamiast `ownerResponse` → zawsze fałszywe dane (gbp_audit_service.py) -- [ ] **GBP phantom fields:** has_posts, has_products, has_qa nigdy nie wypełniane → oznaczyć jako "niedostępne bez OAuth" w _build_gbp_prompt() -- [ ] **SEO: FID→INP:** FID deprecated marzec 2024, INP nie zbierany. Dostępny w `loadingExperience.metrics.INTERACTION_TO_NEXT_PAINT` z PageSpeed API -- [ ] **SEO: 10 metryk do promptu:** FCP, TTFB, TBT, Speed Index, load_time_ms, meta title/desc length, schema details, html lang — JUŻ W DB ale nie w prompcie AI -- [ ] **Social: engagement_rate** — pole w DB istnieje, nigdy nie obliczane. Formuła: estimated base_rate × activity_multiplier -- [ ] **Social: posting_frequency_score** — pole w DB, nigdy nie obliczane. 
0-10 based on posts_count_30d -- [ ] **Social: enrichment promptu** — dodać last_post_date, page_name, engagement metrics +### Faza 0: Quick Wins (1-3 dni, $0) — UKOŃCZONA (2026-02-08) +- [x] **GBP bugfix:** review_response_rate — naprawiono: sprawdza `ownerResponse` zamiast `authorAttribution.displayName` +- [x] **GBP phantom fields:** has_posts, has_products, has_qa oznaczone jako `[dane niedostępne bez autoryzacji OAuth]` +- [x] **GBP prompt:** dodano review_keywords i description_keywords do promptu AI +- [x] **SEO: FID→INP:** zastąpiono FID przez INP w prompcie i szablonach (progi: 200ms/500ms) +- [x] **SEO: 10 metryk do promptu:** FCP, TTFB, TBT, Speed Index, meta title/desc length, schema details, html lang +- [x] **Social: engagement_rate** — obliczane z industry base_rate × activity_multiplier +- [x] **Social: posting_frequency_score** — 0-10 based on posts_count_30d +- [x] **Social: enrichment promptu** — last_post_date, page_name, engagement metrics, brand consistency **Agenci Phase 0 (team: phase0-quickwins):** - gbp-fixer: Fix review_response_rate + GBP prompt enrichment diff --git a/youtube_service.py b/youtube_service.py new file mode 100644 index 0000000..19aae69 --- /dev/null +++ b/youtube_service.py @@ -0,0 +1,160 @@ +""" +YouTube Data API v3 Service for NordaBiz +========================================= + +Simple YouTube API client for fetching channel statistics. +Uses the YouTube Data API v3 with the same Google API key as Places API. 
+ +API Reference: https://developers.google.com/youtube/v3/docs/channels + +Author: NordaBiz Development Team +Created: 2026-02-08 +""" + +import os +import re +import logging +from typing import Optional, Dict + +import requests + +logger = logging.getLogger(__name__) + +# API Configuration +YOUTUBE_API_BASE = "https://www.googleapis.com/youtube/v3" + + +class YouTubeService: + """Fetches YouTube channel statistics via YouTube Data API v3.""" + + def __init__(self, api_key: str = None): + self.api_key = api_key or os.getenv('GOOGLE_PLACES_API_KEY') + if not self.api_key: + raise ValueError("GOOGLE_PLACES_API_KEY not set in environment") + self.session = requests.Session() + + def extract_channel_id_from_url(self, url: str) -> Optional[str]: + """ + Extract channel ID or handle from YouTube URL. + + Supported formats: + - youtube.com/channel/UC1234567890abcdef + - youtube.com/@handle + - youtube.com/c/channelname + - youtube.com/user/username + + Args: + url: YouTube channel URL + + Returns: + Channel ID (starts with UC) or handle (without @) or None + """ + if not url: + return None + + # Direct channel ID (UC...) + match = re.search(r'youtube\.com/channel/([A-Za-z0-9_-]+)', url) + if match: + return match.group(1) + + # Handle (@username) + match = re.search(r'youtube\.com/@([A-Za-z0-9_-]+)', url) + if match: + return match.group(1) # Return without @ + + # Legacy /c/ and /user/ formats + match = re.search(r'youtube\.com/(?:c|user)/([A-Za-z0-9_-]+)', url) + if match: + return match.group(1) + + logger.warning(f"Unable to extract channel ID from URL: {url}") + return None + + def get_channel_stats(self, channel_id_or_username: str) -> Optional[Dict]: + """ + Fetch channel statistics from YouTube Data API v3. + + Args: + channel_id_or_username: YouTube channel ID (UC...) 
or username/handle + + Returns: + Dict with channel stats or None on error: + { + 'subscriber_count': int, + 'view_count': int, + 'video_count': int, + 'channel_title': str, + 'channel_description': str + } + """ + if not channel_id_or_username: + return None + + url = f"{YOUTUBE_API_BASE}/channels" + + # Determine if it's a channel ID (starts with UC) or handle/username + if channel_id_or_username.startswith('UC'): + params = { + 'part': 'statistics,snippet', + 'id': channel_id_or_username, + 'key': self.api_key + } + else: + # For handles, we need to use forHandle (modern) or forUsername (legacy) + params = { + 'part': 'statistics,snippet', + 'forHandle': channel_id_or_username, + 'key': self.api_key + } + + try: + response = self.session.get(url, params=params, timeout=15) + response.raise_for_status() + data = response.json() + + items = data.get('items', []) + if not items: + # Try forUsername as fallback + if not channel_id_or_username.startswith('UC'): + params = { + 'part': 'statistics,snippet', + 'forUsername': channel_id_or_username, + 'key': self.api_key + } + response = self.session.get(url, params=params, timeout=15) + response.raise_for_status() + data = response.json() + items = data.get('items', []) + + if not items: + logger.warning(f"No YouTube channel found for: {channel_id_or_username}") + return None + + channel = items[0] + stats = channel.get('statistics', {}) + snippet = channel.get('snippet', {}) + + result = { + 'subscriber_count': int(stats.get('subscriberCount', 0)), + 'view_count': int(stats.get('viewCount', 0)), + 'video_count': int(stats.get('videoCount', 0)), + 'channel_title': snippet.get('title', ''), + 'channel_description': snippet.get('description', '') + } + + logger.info(f"Fetched YouTube stats for {result['channel_title']}: " + f"{result['subscriber_count']} subscribers, " + f"{result['video_count']} videos") + + return result + + except requests.exceptions.HTTPError as e: + logger.warning(f"YouTube API HTTP error for 
{channel_id_or_username}: " + f"{e.response.status_code} - {e.response.text}") + return None + except requests.exceptions.RequestException as e: + logger.warning(f"YouTube API request error for {channel_id_or_username}: {e}") + return None + except (KeyError, ValueError, TypeError) as e: + logger.warning(f"YouTube API response parse error: {e}") + return None