# nordabiz/tests/unit/test_link_preview.py

"""
Unit Tests — Link Preview
==========================

Tests for blueprints/messages/link_preview.py:
- OGParser with og: meta tags
- OGParser fallback to <title> and meta description
- OGParser with no meta tags
- fetch_link_preview with no URL
- fetch_link_preview skips internal URLs
- fetch_link_preview success (mocked HTTP)
- fetch_link_preview timeout handling
- fetch_link_preview non-HTML content-type
- URL extraction from HTML-formatted text (bare URLs only; href-only links are ignored)
"""

import os
import sys

# Put the directory three levels above this file at the front of sys.path
# (presumably the project root that contains the `blueprints` package) so
# the `blueprints.messages.link_preview` import below resolves.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

import pytest
from unittest.mock import patch, MagicMock
from requests.exceptions import Timeout

from blueprints.messages.link_preview import fetch_link_preview, OGParser


# ============================================================
# OGParser Tests
# ============================================================

class TestOGParser:
    """Exercise OGParser against small, hand-written HTML documents."""

    @staticmethod
    def _feed(markup):
        """Return a fresh OGParser instance that has consumed *markup*."""
        fed = OGParser()
        fed.feed(markup)
        return fed

    def test_parses_og_title_description_image(self):
        """og:title, og:description and og:image all land in ``og``."""
        markup = """
        <html><head>
        <meta property="og:title" content="Test Title">
        <meta property="og:description" content="Test Description">
        <meta property="og:image" content="https://example.com/image.jpg">
        </head></html>
        """
        collected = self._feed(markup).og
        assert collected['title'] == 'Test Title'
        assert collected['description'] == 'Test Description'
        assert collected['image'] == 'https://example.com/image.jpg'

    def test_fallback_to_title_tag_and_meta_description(self):
        """Without og: tags, <title> and meta description are picked up."""
        markup = """
        <html><head>
        <title>Fallback Title</title>
        <meta name="description" content="Fallback Description">
        </head></html>
        """
        fed = self._feed(markup)
        assert fed.title == 'Fallback Title'
        assert fed.og.get('description') == 'Fallback Description'
        # The <title> tag must not be reported as an og:title.
        assert 'title' not in fed.og

    def test_empty_html_returns_title_from_title_tag(self):
        """A page with only a <title> yields that title and nothing else."""
        markup = "<html><head><title>Only Title</title></head></html>"
        fed = self._feed(markup)
        assert fed.title == 'Only Title'
        assert fed.og.get('description') is None
        assert fed.og.get('image') is None

    def test_no_meta_tags_empty_og(self):
        """A page with neither meta tags nor <title> yields no data."""
        markup = "<html><head></head><body>No meta here</body></html>"
        fed = self._feed(markup)
        assert fed.og == {}
        assert fed.title is None

    def test_og_description_takes_precedence_over_meta_description(self):
        """og:description wins when a plain meta description follows it."""
        markup = """
        <html><head>
        <meta property="og:description" content="OG Desc">
        <meta name="description" content="Meta Desc">
        </head></html>
        """
        collected = self._feed(markup).og
        assert collected['description'] == 'OG Desc'


# ============================================================
# fetch_link_preview Tests
# ============================================================

class TestFetchLinkPreview:
    """Test fetch_link_preview function.

    ``requests.get`` is patched in every test, so the suite never touches
    the network. Tests for inputs that must be skipped additionally assert
    that no HTTP request was attempted: a failed fetch also yields ``None``
    (see the timeout test), so checking the return value alone cannot tell
    "skipped" apart from "tried and failed".
    """

    # ``requests.get`` as referenced from the module under test.
    _REQUESTS_GET = 'blueprints.messages.link_preview.requests.get'

    @staticmethod
    def _make_response(body, content_type='text/html'):
        """Build a mock HTTP response.

        Args:
            body: Value exposed as the response's ``text`` attribute.
            content_type: Value of the ``content-type`` response header.

        Returns:
            A MagicMock with ``headers``, ``text`` and a no-op
            ``raise_for_status``.
        """
        mock_resp = MagicMock()
        mock_resp.headers = {'content-type': content_type}
        mock_resp.text = body
        mock_resp.raise_for_status = MagicMock()
        return mock_resp

    def _fetch(self, text, body, content_type='text/html'):
        """Run fetch_link_preview on *text* with ``requests.get`` mocked.

        The mocked call returns a response built from *body* and
        *content_type*; the preview result is returned unchanged.
        """
        mock_resp = self._make_response(body, content_type)
        with patch(self._REQUESTS_GET, return_value=mock_resp):
            return fetch_link_preview(text)

    def _assert_skipped(self, text):
        """Assert that *text* yields no preview and triggers no HTTP request."""
        with patch(self._REQUESTS_GET) as mock_get:
            result = fetch_link_preview(text)
        assert result is None
        mock_get.assert_not_called()

    def test_returns_none_for_none_text(self):
        self._assert_skipped(None)

    def test_returns_none_for_empty_text(self):
        self._assert_skipped('')

    def test_returns_none_when_no_url_in_text(self):
        self._assert_skipped('Cześć, jak się masz?')

    def test_returns_none_for_internal_nordabiznes_url(self):
        self._assert_skipped('Sprawdź https://nordabiznes.pl/company/test')

    def test_returns_none_for_staging_internal_url(self):
        self._assert_skipped('Link: https://staging.nordabiznes.pl/company/foo')

    def test_returns_none_for_localhost_url(self):
        self._assert_skipped('Dev: http://localhost:5000/test')

    def test_success_returns_dict_with_og_data(self):
        html = """<html><head>
        <meta property="og:title" content="Example Title">
        <meta property="og:description" content="Example Description">
        <meta property="og:image" content="https://example.com/img.jpg">
        </head></html>"""

        result = self._fetch(
            'Check out https://example.com',
            html,
            content_type='text/html; charset=utf-8',
        )

        assert result is not None
        assert result['url'] == 'https://example.com'
        assert result['title'] == 'Example Title'
        assert result['description'] == 'Example Description'
        assert result['image'] == 'https://example.com/img.jpg'

    def test_success_uses_title_tag_fallback(self):
        html = "<html><head><title>Page Title</title></head></html>"

        result = self._fetch('See https://example.com for details', html)

        assert result is not None
        assert result['title'] == 'Page Title'

    def test_returns_none_on_timeout(self):
        with patch(self._REQUESTS_GET, side_effect=Timeout):
            result = fetch_link_preview('Visit https://slow-site.example.com')
        assert result is None

    def test_returns_none_for_non_html_content_type(self):
        result = self._fetch(
            'Download https://example.com/doc.pdf',
            '%PDF-1.4 binary content',
            content_type='application/pdf',
        )
        assert result is None

    def test_returns_none_when_page_has_no_title(self):
        html = "<html><head><meta name='robots' content='noindex'></head></html>"

        result = self._fetch('Visit https://example.com', html)
        assert result is None

    def test_title_truncated_to_200_chars(self):
        long_title = 'A' * 300
        html = f"<html><head><title>{long_title}</title></head></html>"

        result = self._fetch('https://example.com', html)

        assert result is not None
        assert len(result['title']) <= 200

    def test_description_truncated_to_300_chars(self):
        long_desc = 'B' * 400
        html = f"""<html><head>
        <title>Title</title>
        <meta name="description" content="{long_desc}">
        </head></html>"""

        result = self._fetch('https://example.com', html)

        assert result is not None
        assert len(result['description']) <= 300


# ============================================================
# URL Extraction from HTML Content Tests
# ============================================================

class TestURLExtractionFromHTML:
    """Test which URL fetch_link_preview picks out of HTML-formatted text.

    Only URLs that appear in the text content are expected to be previewed;
    a URL that exists solely inside an ``href`` attribute is ignored.
    """

    # ``requests.get`` as referenced from the module under test.
    _REQUESTS_GET = 'blueprints.messages.link_preview.requests.get'

    def _fetch_with_title(self, text, title):
        """Run fetch_link_preview on *text* with ``requests.get`` mocked.

        The mocked response is an HTML page whose only metadata is a
        ``<title>`` element containing *title*.
        """
        mock_resp = MagicMock()
        mock_resp.headers = {'content-type': 'text/html'}
        mock_resp.text = f'<html><head><title>{title}</title></head></html>'
        mock_resp.raise_for_status = MagicMock()

        with patch(self._REQUESTS_GET, return_value=mock_resp):
            return fetch_link_preview(text)

    def test_extracts_url_from_anchor_tag(self):
        """A URL present only in an ``href`` attribute produces no preview.

        Once the tags are removed the remaining text is "Visit site", which
        contains no URL, so the result is None and no HTTP request is made.
        """
        text = '<a href="https://external-site.com/page">Visit site</a>'

        # Patch the HTTP call so the test stays offline and so an attempted
        # fetch that merely failed cannot be mistaken for a skipped URL.
        with patch(self._REQUESTS_GET) as mock_get:
            result = fetch_link_preview(text)

        assert result is None
        mock_get.assert_not_called()

    def test_extracts_bare_url_from_mixed_html(self):
        """Bare URL in text alongside HTML is extracted correctly."""
        text = '<p>Check out https://example.com/news for more</p>'

        result = self._fetch_with_title(text, 'News')

        assert result is not None
        assert result['url'] == 'https://example.com/news'

    def test_first_url_is_used_when_multiple_urls_present(self):
        """When text contains multiple URLs, the first one is used."""
        text = 'First: https://first.example.com and second: https://second.example.com'

        result = self._fetch_with_title(text, 'First')

        assert result is not None
        assert result['url'] == 'https://first.example.com'