feat(scripts): add extract_event_from_url.py for visual data extraction
Some checks are pending
NordaBiz Tests / Unit & Integration Tests (push) Waiting to run
NordaBiz Tests / E2E Tests (Playwright) (push) Blocked by required conditions
NordaBiz Tests / Smoke Tests (Production) (push) Blocked by required conditions
NordaBiz Tests / Send Failure Notification (push) Blocked by required conditions

Downloads page text and images from external event URLs so Claude
can visually analyze posters/banners for location, times, and other
details not present in page text (e.g. venue address in graphics).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Maciej Pienczyn 2026-03-19 10:48:07 +01:00
parent 32e5c901c4
commit fe304e895f

View File

@ -0,0 +1,181 @@
#!/usr/bin/env python3
"""
Extract External Event Data from URL
=====================================
Fetches an event page, downloads relevant images (posters, banners),
and saves them locally for visual analysis by Claude.
Usage:
python3 scripts/extract_event_from_url.py <URL>
Output:
- Page text saved to /tmp/event_extract_text.md
- Images saved to /tmp/event_extract_img_*.jpg/png
- Summary printed to stdout
Claude can then read these files (including images) to extract
location, times, and other details that may only appear in graphics.
"""
import sys
import os
import re
import requests
from urllib.parse import urljoin, urlparse
from html.parser import HTMLParser
class ImageExtractor(HTMLParser):
    """Extract image URLs from HTML, filtering for likely event posters/banners.

    Each accepted <img> tag is appended to ``self.images`` as a dict with
    keys 'url' (absolute), 'alt', 'class', 'width', 'height'. Images that
    look like logos, icons, avatars, or tracking pixels are skipped.
    """
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url  # used to resolve relative src attributes
        self.images = []
        # Substring markers of non-content images, matched against the CSS
        # class and the src URL respectively.
        self._skip_classes = {'logo', 'icon', 'avatar', 'favicon', 'sprite', 'social'}
        self._skip_srcs = {'logo', 'icon', 'favicon', 'sprite', 'social', 'flag', 'emoji', 'pixel', 'tracking'}
    def handle_starttag(self, tag, attrs):
        if tag != 'img':
            return
        attr_dict = dict(attrs)
        # NOTE: html.parser reports valueless attributes (e.g. <img src="x" class>)
        # as None, so every attribute is normalized with `or ''` before any
        # str method is applied — the previous .get(key, '') pattern raised
        # AttributeError on None values.
        src = attr_dict.get('src') or ''
        if not src:
            return
        # Skip tiny images (tracking pixels, icons)
        width = attr_dict.get('width') or ''
        height = attr_dict.get('height') or ''
        if width.isdigit() and int(width) < 100:
            return
        if height.isdigit() and int(height) < 100:
            return
        # Skip by class
        css_class = (attr_dict.get('class') or '').lower()
        if any(skip in css_class for skip in self._skip_classes):
            return
        # Skip by src pattern
        src_lower = src.lower()
        if any(skip in src_lower for skip in self._skip_srcs):
            return
        # Make absolute URL relative to the page that embedded the image.
        full_url = urljoin(self.base_url, src)
        self.images.append({
            'url': full_url,
            'alt': attr_dict.get('alt') or '',
            'class': css_class,
            'width': width,
            'height': height,
        })
def fetch_page(url):
    """Fetch *url* and return its HTML body as text.

    Sends a desktop browser User-Agent (some event sites block default
    clients) and raises requests.HTTPError on non-2xx responses.
    """
    response = requests.get(
        url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        },
        timeout=30,
    )
    response.raise_for_status()
    return response.text
def download_image(url, filepath):
    """Download an image from *url* into *filepath*.

    Returns True on success. Returns False — without leaving a file on
    disk — when the response is not an image, the payload is implausibly
    small (< 5 KB, likely an icon) or large (> 10 MB), or any error occurs.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
    }
    try:
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        # Accept if either the server says it's an image or the URL looks
        # like one (some CDNs send generic content types).
        content_type = resp.headers.get('content-type', '')
        if 'image' not in content_type and not url.lower().endswith(('.jpg', '.jpeg', '.png', '.webp', '.gif')):
            return False
        # Size-check the payload BEFORE touching disk; previously the file
        # was written first and deleted when too small/large, needlessly
        # churning the filesystem and briefly leaving junk behind.
        size = len(resp.content)
        if size < 5000 or size > 10_000_000:
            return False
        with open(filepath, 'wb') as f:
            f.write(resp.content)
        return True
    except Exception as e:
        # Deliberately broad: any single failed image is skipped, not fatal.
        print(f"  Skip {url}: {e}")
        return False
def extract_text_simple(html):
    """Very basic HTML to text conversion.

    Strips <script>/<style> blocks, turns block-level tags into newlines,
    removes all remaining tags, decodes HTML entities, and collapses
    blank lines. Returns the cleaned text with one line per block element.
    """
    # Remove script/style — their content is code, not page text.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace common block tags with newlines to preserve line structure
    html = re.sub(r'<(?:p|div|br|h[1-6]|li|tr)[^>]*>', '\n', html, flags=re.IGNORECASE)
    # Remove all remaining tags
    html = re.sub(r'<[^>]+>', '', html)
    # Decode ALL entities in one correct pass. The previous chained
    # .replace() calls decoded '&amp;' first — double-decoding inputs like
    # '&amp;lt;' into '<' — and missed every other entity (&quot;, &#39;, …).
    # Local import because the parameter shadows the `html` module name.
    from html import unescape
    text = unescape(html).replace('\xa0', ' ')  # keep old &nbsp; -> space behavior
    # Clean up whitespace: strip each line, drop empties
    lines = [line.strip() for line in text.split('\n')]
    lines = [line for line in lines if line]
    return '\n'.join(lines)
def main():
    """CLI entry point: fetch an event page, save its text and images to /tmp."""
    if len(sys.argv) < 2:
        print("Usage: python3 scripts/extract_event_from_url.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    print(f"Fetching: {url}")

    page_html = fetch_page(url)

    # Save a plain-text rendering of the page for later reading.
    text = extract_text_simple(page_html)
    text_path = '/tmp/event_extract_text.md'
    with open(text_path, 'w') as f:
        f.write(f"# Source: {url}\n\n{text}")
    print(f"Text saved: {text_path} ({len(text)} chars)")

    # Collect candidate poster/banner images and try to download each one.
    extractor = ImageExtractor(url)
    extractor.feed(page_html)
    print(f"Found {len(extractor.images)} candidate images")

    downloaded = []
    for idx, candidate in enumerate(extractor.images):
        # Derive a file extension from the URL path; default to .jpg when
        # missing or unrecognized.
        ext = os.path.splitext(urlparse(candidate['url']).path)[1] or '.jpg'
        if ext.lower() not in ('.jpg', '.jpeg', '.png', '.webp', '.gif'):
            ext = '.jpg'
        filepath = f'/tmp/event_extract_img_{idx}{ext}'
        print(f"  Downloading: {candidate['url'][:80]}...")
        if not download_image(candidate['url'], filepath):
            continue
        size_kb = os.path.getsize(filepath) / 1024
        downloaded.append({
            'path': filepath,
            'url': candidate['url'],
            'alt': candidate['alt'],
            'size_kb': round(size_kb, 1),
        })
        print(f"    Saved: {filepath} ({size_kb:.0f} KB)")

    # Summary for the caller (and for Claude to pick up the file paths).
    print("\nResults:")
    print(f"  Page text: {text_path}")
    print(f"  Images downloaded: {len(downloaded)}")
    for d in downloaded:
        print(f"    {d['path']} ({d['size_kb']} KB) — {d['alt'][:60] if d['alt'] else 'no alt'}")
    if downloaded:
        print("\nClaude can now read these images with the Read tool to extract")
        print("location, times, and other visual-only information.")
if __name__ == '__main__':
main()