feat: YouTube Data API v3 integration for social media enrichment
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions
- YouTubeService now fetches: subscribers, views, video count, description, avatar, banner, country, creation date, and the 5 most recent videos
- Enricher uses the API first and falls back to scraping
- Extra YouTube data is stored in content_types JSONB
- Audit detail shows view count, country, creation date, and recent videos
- Requires enabling YouTube Data API v3 in Google Cloud Console

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
af225691b6
commit
fe288f0441
@ -974,7 +974,7 @@ def _run_enrichment_background(company_ids, platforms_filter=None):
|
||||
profile_result['enriched_data'] = {
|
||||
k: (str(v) if hasattr(v, 'strftime') else v)
|
||||
for k, v in enriched.items()
|
||||
if k in tracked_fields and v is not None
|
||||
if (k in tracked_fields or k.startswith('_')) and v is not None
|
||||
}
|
||||
|
||||
if changes:
|
||||
@ -1264,6 +1264,12 @@ def admin_social_audit_enrichment_approve():
|
||||
|
||||
enriched = change['enriched_data']
|
||||
for field, value in enriched.items():
|
||||
if field.startswith('_') and field.endswith('_extra'):
|
||||
# Store platform-specific extra data in content_types JSONB
|
||||
ct = dict(profile.content_types or {})
|
||||
ct.update(value if isinstance(value, dict) else {})
|
||||
profile.content_types = ct
|
||||
continue
|
||||
if field == 'last_post_date' and isinstance(value, str):
|
||||
try:
|
||||
from dateutil.parser import parse as parse_date
|
||||
|
||||
@ -1168,36 +1168,69 @@ class SocialProfileEnricher:
|
||||
return result
|
||||
|
||||
def _enrich_youtube(self, url: str) -> Dict[str, Any]:
    """Enrich YouTube channel data via YouTube Data API v3.

    Falls back to scraping the channel page when the API path is not
    available (import failure, missing API key) or yields no stats.

    Args:
        url: Public URL of the YouTube channel.

    Returns:
        Dict of enrichment fields. Possible keys: ``followers_count``,
        ``posts_count_365d``, ``page_name``, ``profile_description``,
        ``has_bio``, ``has_profile_photo``, ``has_cover_photo``,
        ``last_post_date`` and ``_youtube_extra`` (a dict of
        YouTube-specific data). Keys whose value could not be determined
        are omitted; the dict may be empty.
    """
    result = {}
    try:
        # Try YouTube Data API v3 first
        import sys
        from pathlib import Path

        # Make the grandparent directory of this file importable so that
        # `youtube_service` resolves. Only insert it once: this method runs
        # per profile, and an unconditional insert grows sys.path forever.
        service_dir = str(Path(__file__).resolve().parent.parent)
        if service_dir not in sys.path:
            sys.path.insert(0, service_dir)
        from youtube_service import YouTubeService

        yt = YouTubeService()
        channel_id = yt.extract_channel_id_from_url(url)
        if channel_id:
            stats = yt.get_channel_stats(channel_id)
            if stats:
                # A channel that hides its subscriber count is reported
                # with subscriber_count == 0; do not record that as a
                # real follower number.
                if not stats.get('hidden_subscriber_count'):
                    result['followers_count'] = stats['subscriber_count']
                # NOTE(review): video_count looks like the channel's total
                # video count, not a 365-day figure — confirm the intent.
                result['posts_count_365d'] = stats['video_count']
                result['page_name'] = stats['channel_title']
                if stats.get('channel_description'):
                    result['profile_description'] = stats['channel_description'][:500]
                    result['has_bio'] = True
                result['has_profile_photo'] = bool(stats.get('thumbnail_url'))
                result['has_cover_photo'] = bool(stats.get('banner_url'))

                # Store extra data in a special key for content_types JSONB
                result['_youtube_extra'] = {
                    'view_count': stats.get('view_count', 0),
                    'country': stats.get('country', ''),
                    # Keep only the YYYY-MM-DD prefix of the timestamp.
                    'published_at': (stats.get('published_at') or '')[:10],
                    'custom_url': stats.get('custom_url', ''),
                    'thumbnail_url': stats.get('thumbnail_url', ''),
                    'banner_url': stats.get('banner_url', ''),
                }

                # Fetch recent videos (best effort)
                actual_channel_id = stats.get('channel_id') or channel_id
                if actual_channel_id.startswith('UC'):
                    videos = yt.get_recent_videos(actual_channel_id, 5)
                    if videos:
                        result['_youtube_extra']['recent_videos'] = videos
                        # Last post date from most recent video
                        if videos[0].get('date'):
                            from datetime import datetime
                            try:
                                result['last_post_date'] = datetime.strptime(videos[0]['date'], '%Y-%m-%d')
                            except ValueError:
                                # Unparseable date: leave last_post_date unset.
                                pass
                return result
    except (ImportError, ValueError) as e:
        logger.debug(f"YouTube API not available ({e}), falling back to scraping")
    except Exception as e:
        # Best effort: any API failure falls through to scraping below.
        logger.debug(f"YouTube API enrichment failed: {e}")

    # Fallback: scraping (usually returns nothing due to JS rendering)
    try:
        resp = self.session.get(url, timeout=REQUEST_TIMEOUT)
        if resp.status_code == 200:
            html = resp.text
            # Subscriber count from meta or JSON
            subs_match = re.search(r'"subscriberCountText":\s*\{"simpleText":\s*"([^"]+)"\}', html)
            if subs_match:
                result['followers_count'] = self._parse_count(subs_match.group(1).split(' ')[0])
            # Video count
            videos_match = re.search(r'"videosCountText":\s*\{"runs":\s*\[\{"text":\s*"([^"]+)"\}', html)
            if videos_match:
                result['posts_count_365d'] = self._parse_count(videos_match.group(1))
            # Channel description
            desc_match = re.search(r'"description":\s*"([^"]*(?:\\.[^"]*)*)"', html)
            if desc_match:
                desc = desc_match.group(1).replace('\\n', ' ').strip()
                if desc and len(desc) > 5:
                    result['profile_description'] = desc[:500]
                    result['has_bio'] = True
            # Avatar from og:image
            og_img = re.search(r'<meta\s+(?:property|name)="og:image"\s+content="([^"]+)"', html)
            result['has_profile_photo'] = bool(og_img)
            # Channel name
            name_match = re.search(r'<meta\s+(?:property|name)="og:title"\s+content="([^"]+)"', html)
            if name_match:
                result['page_name'] = name_match.group(1)
    except Exception as e:
        logger.debug(f"YouTube scraping failed: {e}")
    return result
|
||||
|
||||
def _enrich_linkedin(self, url: str) -> Dict[str, Any]:
|
||||
|
||||
@ -832,6 +832,38 @@
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<!-- YouTube extra data -->
|
||||
{% if ct.get('view_count') or ct.get('recent_videos') %}
|
||||
<div style="margin-top: var(--spacing-sm);">
|
||||
{% set yt_info = [] %}
|
||||
{% if ct.get('view_count') %}{% if yt_info.append(('👁️', '{:,}'.format(ct.view_count).replace(',', ' ') ~ ' wyświetleń łącznie')) %}{% endif %}{% endif %}
|
||||
{% if ct.get('country') %}{% if yt_info.append(('🌍', ct.country)) %}{% endif %}{% endif %}
|
||||
{% if ct.get('published_at') %}{% if yt_info.append(('📅', 'Kanał od: ' ~ ct.published_at)) %}{% endif %}{% endif %}
|
||||
{% if ct.get('custom_url') %}{% if yt_info.append(('🔗', ct.custom_url)) %}{% endif %}{% endif %}
|
||||
{% if yt_info %}
|
||||
<div style="display: flex; gap: var(--spacing-xs); flex-wrap: wrap; margin-bottom: var(--spacing-sm);">
|
||||
{% for icon, val in yt_info %}
|
||||
<span style="background: var(--background); padding: 3px 10px; border-radius: var(--radius); font-size: 12px; color: var(--text-secondary); border: 1px solid var(--border-color, #e5e7eb);">
|
||||
{{ icon }} {{ val }}
|
||||
</span>
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if ct.get('recent_videos') %}
|
||||
<div style="font-size: 11px; font-weight: 600; color: var(--text-secondary); margin-bottom: 4px;">Ostatnie filmy</div>
|
||||
{% for vid in ct.recent_videos %}
|
||||
<div style="padding: 4px 0; border-bottom: 1px solid #f3f4f6; font-size: 12px; display: flex; gap: var(--spacing-sm); align-items: baseline;">
|
||||
<span style="color: var(--text-secondary); white-space: nowrap;">{{ vid.date }}</span>
|
||||
<span style="flex: 1; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;">{{ vid.title or '(bez tytułu)' }}</span>
|
||||
{% if vid.video_id %}
|
||||
<a href="https://youtube.com/watch?v={{ vid.video_id }}" target="_blank" rel="noopener" style="font-size: 11px; color: #dc2626; white-space: nowrap;">▶ Oglądaj</a>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
<!-- Data Provenance Section -->
|
||||
|
||||
@ -28,9 +28,9 @@ class YouTubeService:
|
||||
"""Fetches YouTube channel statistics via YouTube Data API v3."""
|
||||
|
||||
def __init__(self, api_key: Optional[str] = None):
    """Set up the API key and a reusable HTTP session.

    Args:
        api_key: Explicit Google API key. When omitted or empty, the first
            non-empty value among the environment variables
            ``YOUTUBE_API_KEY``, ``GOOGLE_PAGESPEED_API_KEY`` and
            ``GOOGLE_PLACES_API_KEY`` (checked in that order) is used.

    Raises:
        ValueError: If no key is passed and none of the environment
            variables is set.
    """
    self.api_key = (
        api_key
        or os.getenv('YOUTUBE_API_KEY')
        or os.getenv('GOOGLE_PAGESPEED_API_KEY')
        or os.getenv('GOOGLE_PLACES_API_KEY')
    )
    if not self.api_key:
        raise ValueError("No Google API key found (YOUTUBE_API_KEY, GOOGLE_PAGESPEED_API_KEY, or GOOGLE_PLACES_API_KEY)")
    self.session = requests.Session()
|
||||
|
||||
def extract_channel_id_from_url(self, url: str) -> Optional[str]:
|
||||
@ -95,14 +95,14 @@ class YouTubeService:
|
||||
# Determine if it's a channel ID (starts with UC) or handle/username
|
||||
if channel_id_or_username.startswith('UC'):
|
||||
params = {
|
||||
'part': 'statistics,snippet',
|
||||
'part': 'statistics,snippet,brandingSettings',
|
||||
'id': channel_id_or_username,
|
||||
'key': self.api_key
|
||||
}
|
||||
else:
|
||||
# For handles, we need to use forHandle (modern) or forUsername (legacy)
|
||||
params = {
|
||||
'part': 'statistics,snippet',
|
||||
'part': 'statistics,snippet,brandingSettings',
|
||||
'forHandle': channel_id_or_username,
|
||||
'key': self.api_key
|
||||
}
|
||||
@ -117,7 +117,7 @@ class YouTubeService:
|
||||
# Try forUsername as fallback
|
||||
if not channel_id_or_username.startswith('UC'):
|
||||
params = {
|
||||
'part': 'statistics,snippet',
|
||||
'part': 'statistics,snippet,brandingSettings',
|
||||
'forUsername': channel_id_or_username,
|
||||
'key': self.api_key
|
||||
}
|
||||
@ -131,15 +131,26 @@ class YouTubeService:
|
||||
return None
|
||||
|
||||
channel = items[0]
|
||||
channel_id = channel.get('id', channel_id_or_username)
|
||||
stats = channel.get('statistics', {})
|
||||
snippet = channel.get('snippet', {})
|
||||
branding = channel.get('brandingSettings', {}).get('channel', {})
|
||||
thumbnails = snippet.get('thumbnails', {})
|
||||
|
||||
result = {
|
||||
'channel_id': channel_id,
|
||||
'subscriber_count': int(stats.get('subscriberCount', 0)),
|
||||
'hidden_subscriber_count': stats.get('hiddenSubscriberCount', False),
|
||||
'view_count': int(stats.get('viewCount', 0)),
|
||||
'video_count': int(stats.get('videoCount', 0)),
|
||||
'channel_title': snippet.get('title', ''),
|
||||
'channel_description': snippet.get('description', '')
|
||||
'channel_description': snippet.get('description', ''),
|
||||
'custom_url': snippet.get('customUrl', ''),
|
||||
'published_at': snippet.get('publishedAt', ''),
|
||||
'country': snippet.get('country', ''),
|
||||
'thumbnail_url': thumbnails.get('high', thumbnails.get('default', {})).get('url', ''),
|
||||
'banner_url': channel.get('brandingSettings', {}).get('image', {}).get('bannerExternalUrl', ''),
|
||||
'keywords': branding.get('keywords', ''),
|
||||
}
|
||||
|
||||
logger.info(f"Fetched YouTube stats for {result['channel_title']}: "
|
||||
@ -158,3 +169,41 @@ class YouTubeService:
|
||||
except (KeyError, ValueError, TypeError) as e:
|
||||
logger.warning(f"YouTube API response parse error: {e}")
|
||||
return None
|
||||
|
||||
def get_recent_videos(self, channel_id: str, max_results: int = 5) -> list:
    """Fetch recent videos from a channel (costs 100 quota units).

    Args:
        channel_id: YouTube channel ID (UC...)
        max_results: Number of videos to fetch. Values above 50 are
            capped at 50; values below 1 return an empty list without
            calling the API.

    Returns:
        List of dicts with keys ``title``, ``date`` (first 10 characters
        of the publish timestamp) and ``video_id``, in the order returned
        by the API (requested with ``order=date``). Empty list on error or
        when ``channel_id`` is not a ``UC...`` ID.
    """
    if not channel_id or not channel_id.startswith('UC'):
        return []

    # Cap at the documented maximum, and do not spend quota on a request
    # that cannot return anything.
    max_results = min(max_results, 50)
    if max_results < 1:
        return []

    try:
        resp = self.session.get(f"{YOUTUBE_API_BASE}/search", params={
            'part': 'snippet',
            'channelId': channel_id,
            'order': 'date',
            'type': 'video',
            'maxResults': max_results,
            'key': self.api_key,
        }, timeout=15)
        resp.raise_for_status()
        items = resp.json().get('items', [])

        videos = []
        for item in items:
            # `or` guards against explicit nulls in the payload so one odd
            # item does not discard the whole list via the except below.
            snippet = item.get('snippet') or {}
            video_ref = item.get('id') or {}
            videos.append({
                'title': snippet.get('title') or '',
                'date': (snippet.get('publishedAt') or '')[:10],
                'video_id': video_ref.get('videoId') or '',
            })
        return videos
    except Exception as e:
        # Deliberately broad: this lookup is best effort and must never
        # break the caller's enrichment run.
        logger.debug(f"YouTube recent videos fetch failed: {e}")
        return []
|
||||
|
||||
Loading…
Reference in New Issue
Block a user