feat: YouTube Data API v3 integration for social media enrichment
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

- YouTubeService now fetches: subscribers, views, video count, description,
  avatar, banner, country, creation date, and the 5 most recent videos
- Enricher uses API first, falls back to scraping
- Extra YouTube data stored in content_types JSONB
- Audit detail shows view count, country, creation date, recent videos
- Requires enabling YouTube Data API v3 in Google Cloud Console

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-12 14:01:15 +01:00
parent af225691b6
commit fe288f0441
4 changed files with 146 additions and 26 deletions

View File

@ -974,7 +974,7 @@ def _run_enrichment_background(company_ids, platforms_filter=None):
profile_result['enriched_data'] = {
k: (str(v) if hasattr(v, 'strftime') else v)
for k, v in enriched.items()
if k in tracked_fields and v is not None
if (k in tracked_fields or k.startswith('_')) and v is not None
}
if changes:
@ -1264,6 +1264,12 @@ def admin_social_audit_enrichment_approve():
enriched = change['enriched_data']
for field, value in enriched.items():
if field.startswith('_') and field.endswith('_extra'):
# Store platform-specific extra data in content_types JSONB
ct = dict(profile.content_types or {})
ct.update(value if isinstance(value, dict) else {})
profile.content_types = ct
continue
if field == 'last_post_date' and isinstance(value, str):
try:
from dateutil.parser import parse as parse_date

View File

@ -1168,36 +1168,69 @@ class SocialProfileEnricher:
return result
def _enrich_youtube(self, url: str) -> Dict[str, Any]:
"""Enrich YouTube channel data."""
"""Enrich YouTube channel data via YouTube Data API v3.
Falls back to scraping if API key is not available.
"""
result = {}
try:
# Try YouTube Data API v3 first
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from youtube_service import YouTubeService
yt = YouTubeService()
channel_id = yt.extract_channel_id_from_url(url)
if channel_id:
stats = yt.get_channel_stats(channel_id)
if stats:
result['followers_count'] = stats['subscriber_count']
result['posts_count_365d'] = stats['video_count']
result['page_name'] = stats['channel_title']
if stats.get('channel_description'):
result['profile_description'] = stats['channel_description'][:500]
result['has_bio'] = True
result['has_profile_photo'] = bool(stats.get('thumbnail_url'))
result['has_cover_photo'] = bool(stats.get('banner_url'))
# Store extra data in a special key for content_types JSONB
result['_youtube_extra'] = {
'view_count': stats.get('view_count', 0),
'country': stats.get('country', ''),
'published_at': stats.get('published_at', '')[:10] if stats.get('published_at') else '',
'custom_url': stats.get('custom_url', ''),
'thumbnail_url': stats.get('thumbnail_url', ''),
'banner_url': stats.get('banner_url', ''),
}
# Fetch recent videos (best effort)
actual_channel_id = stats.get('channel_id', channel_id)
if actual_channel_id.startswith('UC'):
videos = yt.get_recent_videos(actual_channel_id, 5)
if videos:
result['_youtube_extra']['recent_videos'] = videos
# Last post date from most recent video
if videos[0].get('date'):
from datetime import datetime
try:
result['last_post_date'] = datetime.strptime(videos[0]['date'], '%Y-%m-%d')
except ValueError:
pass
return result
except (ImportError, ValueError) as e:
logger.debug(f"YouTube API not available ({e}), falling back to scraping")
except Exception as e:
logger.debug(f"YouTube API enrichment failed: {e}")
# Fallback: scraping (usually returns nothing due to JS rendering)
try:
resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
if resp.status_code == 200:
html = resp.text
# Subscriber count from meta or JSON
subs_match = re.search(r'"subscriberCountText":\s*\{"simpleText":\s*"([^"]+)"\}', html)
if subs_match:
result['followers_count'] = self._parse_count(subs_match.group(1).split(' ')[0])
# Video count
videos_match = re.search(r'"videosCountText":\s*\{"runs":\s*\[\{"text":\s*"([^"]+)"\}', html)
if videos_match:
result['posts_count_365d'] = self._parse_count(videos_match.group(1))
# Channel description
desc_match = re.search(r'"description":\s*"([^"]*(?:\\.[^"]*)*)"', html)
if desc_match:
desc = desc_match.group(1).replace('\\n', ' ').strip()
if desc and len(desc) > 5:
result['profile_description'] = desc[:500]
result['has_bio'] = True
# Avatar from og:image
og_img = re.search(r'<meta\s+(?:property|name)="og:image"\s+content="([^"]+)"', html)
result['has_profile_photo'] = bool(og_img)
# Channel name
name_match = re.search(r'<meta\s+(?:property|name)="og:title"\s+content="([^"]+)"', html)
if name_match:
result['page_name'] = name_match.group(1)
except Exception as e:
logger.debug(f"YouTube enrichment failed: {e}")
logger.debug(f"YouTube scraping failed: {e}")
return result
def _enrich_linkedin(self, url: str) -> Dict[str, Any]:

View File

@ -832,6 +832,38 @@
{% endfor %}
</div>
{% endif %}
<!-- YouTube extra data -->
{% if ct.get('view_count') or ct.get('recent_videos') %}
<div style="margin-top: var(--spacing-sm);">
{% set yt_info = [] %}
{% if ct.get('view_count') %}{% if yt_info.append(('👁️', '{:,}'.format(ct.view_count).replace(',', ' ') ~ ' wyświetleń łącznie')) %}{% endif %}{% endif %}
{% if ct.get('country') %}{% if yt_info.append(('🌍', ct.country)) %}{% endif %}{% endif %}
{% if ct.get('published_at') %}{% if yt_info.append(('📅', 'Kanał od: ' ~ ct.published_at)) %}{% endif %}{% endif %}
{% if ct.get('custom_url') %}{% if yt_info.append(('🔗', ct.custom_url)) %}{% endif %}{% endif %}
{% if yt_info %}
<div style="display: flex; gap: var(--spacing-xs); flex-wrap: wrap; margin-bottom: var(--spacing-sm);">
{% for icon, val in yt_info %}
<span style="background: var(--background); padding: 3px 10px; border-radius: var(--radius); font-size: 12px; color: var(--text-secondary); border: 1px solid var(--border-color, #e5e7eb);">
{{ icon }} {{ val }}
</span>
{% endfor %}
</div>
{% endif %}
{% if ct.get('recent_videos') %}
<div style="font-size: 11px; font-weight: 600; color: var(--text-secondary); margin-bottom: 4px;">Ostatnie filmy</div>
{% for vid in ct.recent_videos %}
<div style="padding: 4px 0; border-bottom: 1px solid #f3f4f6; font-size: 12px; display: flex; gap: var(--spacing-sm); align-items: baseline;">
<span style="color: var(--text-secondary); white-space: nowrap;">{{ vid.date }}</span>
<span style="flex: 1; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{{ vid.title or '(bez tytułu)' }}</span>
{% if vid.video_id %}
<a href="https://youtube.com/watch?v={{ vid.video_id }}" target="_blank" rel="noopener" style="font-size: 11px; color: #dc2626; white-space: nowrap;">▶ Oglądaj</a>
{% endif %}
</div>
{% endfor %}
{% endif %}
</div>
{% endif %}
{% endif %}
<!-- Data Provenance Section -->

View File

@ -28,9 +28,9 @@ class YouTubeService:
"""Fetches YouTube channel statistics via YouTube Data API v3."""
def __init__(self, api_key: str = None):
self.api_key = api_key or os.getenv('GOOGLE_PLACES_API_KEY')
self.api_key = api_key or os.getenv('YOUTUBE_API_KEY') or os.getenv('GOOGLE_PAGESPEED_API_KEY') or os.getenv('GOOGLE_PLACES_API_KEY')
if not self.api_key:
raise ValueError("GOOGLE_PLACES_API_KEY not set in environment")
raise ValueError("No Google API key found (YOUTUBE_API_KEY, GOOGLE_PAGESPEED_API_KEY, or GOOGLE_PLACES_API_KEY)")
self.session = requests.Session()
def extract_channel_id_from_url(self, url: str) -> Optional[str]:
@ -95,14 +95,14 @@ class YouTubeService:
# Determine if it's a channel ID (starts with UC) or handle/username
if channel_id_or_username.startswith('UC'):
params = {
'part': 'statistics,snippet',
'part': 'statistics,snippet,brandingSettings',
'id': channel_id_or_username,
'key': self.api_key
}
else:
# For handles, we need to use forHandle (modern) or forUsername (legacy)
params = {
'part': 'statistics,snippet',
'part': 'statistics,snippet,brandingSettings',
'forHandle': channel_id_or_username,
'key': self.api_key
}
@ -117,7 +117,7 @@ class YouTubeService:
# Try forUsername as fallback
if not channel_id_or_username.startswith('UC'):
params = {
'part': 'statistics,snippet',
'part': 'statistics,snippet,brandingSettings',
'forUsername': channel_id_or_username,
'key': self.api_key
}
@ -131,15 +131,26 @@ class YouTubeService:
return None
channel = items[0]
channel_id = channel.get('id', channel_id_or_username)
stats = channel.get('statistics', {})
snippet = channel.get('snippet', {})
branding = channel.get('brandingSettings', {}).get('channel', {})
thumbnails = snippet.get('thumbnails', {})
result = {
'channel_id': channel_id,
'subscriber_count': int(stats.get('subscriberCount', 0)),
'hidden_subscriber_count': stats.get('hiddenSubscriberCount', False),
'view_count': int(stats.get('viewCount', 0)),
'video_count': int(stats.get('videoCount', 0)),
'channel_title': snippet.get('title', ''),
'channel_description': snippet.get('description', '')
'channel_description': snippet.get('description', ''),
'custom_url': snippet.get('customUrl', ''),
'published_at': snippet.get('publishedAt', ''),
'country': snippet.get('country', ''),
'thumbnail_url': thumbnails.get('high', thumbnails.get('default', {})).get('url', ''),
'banner_url': channel.get('brandingSettings', {}).get('image', {}).get('bannerExternalUrl', ''),
'keywords': branding.get('keywords', ''),
}
logger.info(f"Fetched YouTube stats for {result['channel_title']}: "
@ -158,3 +169,41 @@ class YouTubeService:
except (KeyError, ValueError, TypeError) as e:
logger.warning(f"YouTube API response parse error: {e}")
return None
def get_recent_videos(self, channel_id: str, max_results: int = 5) -> list:
    """Fetch the most recent videos of a channel (costs 100 quota units).

    Queries the ``search`` endpoint ordered by date. This is a best-effort
    call: any network, HTTP or parsing failure is logged at debug level and
    reported as an empty list rather than raised.

    Args:
        channel_id: YouTube channel ID (must start with ``UC``).
        max_results: Number of videos to fetch. Values above 50 are clamped
            to 50 (the endpoint's maximum); zero, negative or non-numeric
            values yield an empty list without spending quota.

    Returns:
        List of dicts with keys ``title``, ``date`` (``YYYY-MM-DD``) and
        ``video_id``, or an empty list on error / invalid input.
    """
    if not channel_id or not channel_id.startswith('UC'):
        return []

    # Clamp before calling: an out-of-range maxResults makes the API reject
    # the whole request, which the caller would only see as "no videos".
    try:
        limit = min(int(max_results), 50)
    except (TypeError, ValueError):
        return []
    if limit <= 0:
        # Nothing requested; skip the 100-unit search call entirely.
        return []

    try:
        resp = self.session.get(f"{YOUTUBE_API_BASE}/search", params={
            'part': 'snippet',
            'channelId': channel_id,
            'order': 'date',
            'type': 'video',
            'maxResults': limit,
            'key': self.api_key,
        }, timeout=15)
        resp.raise_for_status()

        videos = []
        for item in resp.json().get('items') or []:
            # Use ``or`` fallbacks so that a null field in one item does not
            # raise and discard every other video via the except below.
            snippet = item.get('snippet') or {}
            video_ref = item.get('id') or {}
            videos.append({
                'title': snippet.get('title') or '',
                # publishedAt is an ISO timestamp; keep only the date part.
                'date': (snippet.get('publishedAt') or '')[:10],
                'video_id': video_ref.get('videoId') or '',
            })
        return videos
    except Exception as e:
        # Deliberate best-effort: recent videos are optional enrichment data.
        logger.debug(f"YouTube recent videos fetch failed: {e}")
        return []