fix: Naprawiono dekodowanie URL-i Google News
Zmieniono kolejność metod dekodowania - googlenewsdecoder jest teraz używany jako pierwsza metoda zamiast ostatniej. Poprzednia kolejność powodowała wpadanie w pętlę z consent.google.com i wyczerpanie max_depth przed wywołaniem działającej biblioteki.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
900a3b4ed9
commit
081c0d7ec5
@ -176,10 +176,8 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
|
||||
"""
|
||||
Decode Google News URL to original source URL.
|
||||
|
||||
Google News uses different formats:
|
||||
1. /rss/articles/CBMi... - Base64 encoded
|
||||
2. /articles/CBMi... - Base64 encoded
|
||||
3. Redirects through consent.google.com
|
||||
Google News uses Protocol Buffer encoding (not simple Base64).
|
||||
The googlenewsdecoder library handles this correctly.
|
||||
|
||||
Args:
|
||||
google_url: URL to decode
|
||||
@ -191,7 +189,14 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
|
||||
if max_depth <= 0:
|
||||
return None
|
||||
|
||||
# Method 1: Decode Base64 from URL (preferred - no HTTP request)
|
||||
# Method 1: Use googlenewsdecoder library (PREFERRED - handles Protocol Buffer encoding)
|
||||
# This is the most reliable method for modern Google News URLs
|
||||
decoded = decode_google_news_url_with_library(google_url)
|
||||
if decoded:
|
||||
logger.debug(f"googlenewsdecoder succeeded: {decoded[:80]}...")
|
||||
return decoded
|
||||
|
||||
# Method 2: Try Base64 decode (fallback for older URL formats)
|
||||
try:
|
||||
# Find encoded part (supports both /articles/ and /rss/articles/)
|
||||
match = re.search(r'/(?:rss/)?articles/([A-Za-z0-9_-]+)', google_url)
|
||||
@ -205,10 +210,10 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
|
||||
|
||||
# Decode
|
||||
try:
|
||||
decoded = base64.urlsafe_b64decode(encoded)
|
||||
decoded_bytes = base64.urlsafe_b64decode(encoded)
|
||||
|
||||
# Find URLs in decoded data
|
||||
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded)
|
||||
urls = re.findall(rb'https?://[^\x00-\x1f\s"\'<>]+', decoded_bytes)
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
@ -220,6 +225,7 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
|
||||
url_str = url_str.split('\r')[0]
|
||||
url_str = url_str.split('\n')[0]
|
||||
if url_str.startswith('http'):
|
||||
logger.debug(f"Base64 decode succeeded: {url_str[:80]}...")
|
||||
return url_str
|
||||
except:
|
||||
continue
|
||||
@ -228,41 +234,31 @@ def decode_google_news_url(google_url: str, max_depth: int = 3) -> Optional[str]
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Method 2: Follow redirects (only if Base64 didn't work)
|
||||
# NOTE: This method makes an HTTP request
|
||||
try:
|
||||
response = requests.get(
|
||||
google_url,
|
||||
headers=GOOGLE_NEWS_HEADERS,
|
||||
timeout=15,
|
||||
allow_redirects=True
|
||||
)
|
||||
final_url = response.url
|
||||
response.close()
|
||||
# Method 3: Follow redirects (last resort - often fails due to consent.google.com)
|
||||
# Only try this if we haven't exhausted max_depth significantly
|
||||
if max_depth >= 2:
|
||||
try:
|
||||
response = requests.get(
|
||||
google_url,
|
||||
headers=GOOGLE_NEWS_HEADERS,
|
||||
timeout=10,
|
||||
allow_redirects=True
|
||||
)
|
||||
final_url = response.url
|
||||
response.close()
|
||||
|
||||
# If we landed on consent.google.com, extract URL from parameters
|
||||
if 'consent.google.com' in final_url:
|
||||
parsed = urlparse(final_url)
|
||||
params = parse_qs(parsed.query)
|
||||
if 'continue' in params:
|
||||
continue_url = unquote(params['continue'][0])
|
||||
# Iteratively decode (not recursively!)
|
||||
if 'news.google.com' in continue_url:
|
||||
return decode_google_news_url(continue_url, max_depth - 1)
|
||||
return continue_url
|
||||
# If it's not Google, we have the original URL
|
||||
if 'google.com' not in final_url:
|
||||
logger.debug(f"Redirect follow succeeded: {final_url[:80]}...")
|
||||
return final_url
|
||||
|
||||
# If it's not Google, we have the original URL
|
||||
if 'google.com' not in final_url:
|
||||
return final_url
|
||||
# If we landed on consent.google.com, don't recurse - it doesn't help
|
||||
# The consent page doesn't redirect to the actual article
|
||||
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Method 3: Use googlenewsdecoder library (handles Protocol Buffer encoding)
|
||||
decoded = decode_google_news_url_with_library(google_url)
|
||||
if decoded:
|
||||
return decoded
|
||||
except Exception as e:
|
||||
logger.debug(f"Redirect follow failed: {e}")
|
||||
|
||||
logger.warning(f"All Google News URL decoding methods failed for: {google_url[:80]}...")
|
||||
return None
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user