feat(scripts): add extract_event_from_url.py for visual data extraction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Downloads page text and images from external event URLs so Claude
can visually analyze posters/banners for location, times, and other
details not present in page text (e.g. venue address in graphics).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-19 10:48:07 +01:00
parent 32e5c901c4
commit fe304e895f

View File

@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
Extract External Event Data from URL
=====================================
Fetches an event page, downloads relevant images (posters, banners),
and saves them locally for visual analysis by Claude.
Usage:
python3 scripts/extract_event_from_url.py <URL>
Output:
- Page text saved to /tmp/event_extract_text.md
- Images saved to /tmp/event_extract_img_*.jpg/png
- Summary printed to stdout
Claude can then read these files (including images) to extract
location, times, and other details that may only appear in graphics.
"""
import sys
import os
import re
import requests
from urllib.parse import urljoin, urlparse
from html.parser import HTMLParser
class ImageExtractor(HTMLParser):
    """Extract image URLs from HTML, filtering for likely event posters/banners.

    Each accepted <img> tag is appended to ``self.images`` as a dict with
    keys 'url' (absolute), 'alt', 'class', 'width', 'height'. Images that
    look like logos, icons, avatars, or tracking pixels are skipped.
    """
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url  # used to resolve relative src attributes
        self.images = []
        # Substring markers of non-content images, matched against the CSS
        # class and the src URL respectively.
        self._skip_classes = {'logo', 'icon', 'avatar', 'favicon', 'sprite', 'social'}
        self._skip_srcs = {'logo', 'icon', 'favicon', 'sprite', 'social', 'flag', 'emoji', 'pixel', 'tracking'}
    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        attr_dict = dict(attrs)
        # NOTE: html.parser reports valueless attributes (e.g. <img src="x" class>)
        # as None, so every attribute is normalized with `or ''` before any
        # str method is applied — the previous .get(key, '') pattern raised
        # AttributeError on None values.
        src = attr_dict.get('src') or ''
        if not src:
            return
        # Skip tiny images (tracking pixels, icons)
        width = attr_dict.get('width') or ''
        height = attr_dict.get('height') or ''
        if width.isdigit() and int(width) < 100:
            return
        if height.isdigit() and int(height) < 100:
            return
        # Skip by class
        css_class = (attr_dict.get('class') or '').lower()
        if any(skip in css_class for skip in self._skip_classes):
            return
        # Skip by src pattern
        src_lower = src.lower()
        if any(skip in src_lower for skip in self._skip_srcs):
            return
        # Make absolute URL relative to the page that embedded the image.
        full_url = urljoin(self.base_url, src)
        self.images.append({
            'url': full_url,
            'alt': attr_dict.get('alt') or '',
            'class': css_class,
            'width': width,
            'height': height,
        })
def fetch_page(url):
    """Fetch *url* and return its HTML body as text.

    Sends a desktop browser User-Agent (some event sites block default
    clients) and raises requests.HTTPError on non-2xx responses.
    """
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.text
def download_image(url, filepath):
    """Download an image from *url* into *filepath*.

    Returns True on success. Returns False — without leaving a file on
    disk — when the response is not an image, the payload is implausibly
    small (< 5 KB, likely an icon) or large (> 10 MB), or any error occurs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        # Accept if either the server says it's an image or the URL looks
        # like one (some CDNs send generic content types).
        content_type = resp.headers.get('content-type', '')
        if 'image' not in content_type and not url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
            return False
        # Size-check the payload BEFORE touching disk; previously the file
        # was written first and deleted when too small/large, needlessly
        # churning the filesystem and briefly leaving junk behind.
        size = len(resp.content)
        if size < 5000 or size > 10_000_000:
            return False
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return True
    except Exception as e:
        # Deliberately broad: any single failed image is skipped, not fatal.
        print(f"  Skip {url}: {e}")
        return False
def extract_text_simple(html):
    """Very basic HTML to text conversion.

    Strips <script>/<style> blocks, turns block-level tags into newlines,
    removes all remaining tags, decodes HTML entities, and collapses
    blank lines. Returns the cleaned text with one line per block element.
    """
    # Remove script/style — their content is code, not page text.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace common block tags with newlines to preserve line structure
    html = re.sub(r'<(?:p|div|br|h[1-6]|li|tr)[^>]*>', '\n', html, flags=re.IGNORECASE)
    # Remove all remaining tags
    html = re.sub(r'<[^>]+>', '', html)
    # Decode ALL entities in one correct pass. The previous chained
    # .replace() calls decoded '&amp;' first — double-decoding inputs like
    # '&amp;lt;' into '<' — and missed every other entity (&quot;, &#39;, …).
    # Local import because the parameter shadows the `html` module name.
    from html import unescape
    text = unescape(html).replace('\xa0', ' ')  # keep old &nbsp; -> space behavior
    # Clean up whitespace: strip each line, drop empties
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line]
    return '\n'.join(lines)
def main():
    """CLI entry point: fetch an event page, save its text and images to /tmp."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/extract_event_from_url.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    print(f"Fetching: {url}")

    page_html = fetch_page(url)

    # Save a plain-text rendering of the page for later reading.
    text = extract_text_simple(page_html)
    text_path = '/tmp/event_extract_text.md'
    with open(text_path, 'w') as f:
        f.write(f"# Source: {url}\n\n{text}")
    print(f"Text saved: {text_path} ({len(text)} chars)")

    # Collect candidate poster/banner images and try to download each one.
    extractor = ImageExtractor(url)
    extractor.feed(page_html)
    print(f"Found {len(extractor.images)} candidate images")

    downloaded = []
    for idx, candidate in enumerate(extractor.images):
        # Derive a file extension from the URL path; default to .jpg when
        # missing or unrecognized.
        ext = os.path.splitext(urlparse(candidate['url']).path)[1] or '.jpg'
        if ext.lower() not in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
            ext = '.jpg'
        filepath = f'/tmp/event_extract_img_{idx}{ext}'
        print(f"  Downloading: {candidate['url'][:80]}...")
        if not download_image(candidate['url'], filepath):
            continue
        size_kb = os.path.getsize(filepath) / 1024
        downloaded.append({
            'path': filepath,
            'url': candidate['url'],
            'alt': candidate['alt'],
            'size_kb': round(size_kb, 1),
        })
        print(f"    Saved: {filepath} ({size_kb:.0f} KB)")

    # Summary for the caller (and for Claude to pick up the file paths).
    print("\nResults:")
    print(f"  Page text: {text_path}")
    print(f"  Images downloaded: {len(downloaded)}")
    for d in downloaded:
        print(f"    {d['path']} ({d['size_kb']} KB) — {d['alt'][:60] if d['alt'] else 'no alt'}")
    if downloaded:
        print("\nClaude can now read these images with the Read tool to extract")
        print("location, times, and other visual-only information.")
if __name__ == '__main__':
main()