# nordabiz/utils/helpers.py

"""
Helper Functions
================

Common utility functions used across blueprints.
"""

import re
import logging

import bleach

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Allowed HTML tags and attributes for rich-text content (announcements, events, proceedings).
# sanitize_html() passes both values to bleach.clean(); tags that are not listed are stripped.
_ALLOWED_TAGS = ['p', 'br', 'strong', 'em', 'b', 'i', 'a', 'ul', 'ol', 'li', 'h3', 'h4', 'blockquote', 'img']
# Per-tag attribute whitelist: only <a> and <img> have any attributes listed.
_ALLOWED_ATTRS = {'a': ['href', 'target', 'rel'], 'img': ['src', 'alt']}


def sanitize_html(content):
    """
    Clean an HTML fragment so it can be stored without risking stored XSS.

    Only the tags in ``_ALLOWED_TAGS`` and the attributes in
    ``_ALLOWED_ATTRS`` are kept; everything else is handed to bleach with
    ``strip=True``.

    Args:
        content: HTML string to sanitize

    Returns:
        Sanitized HTML string, or ``content`` itself when it is falsy
        (``None``, empty string)
    """
    if content:
        return bleach.clean(
            content,
            tags=_ALLOWED_TAGS,
            attributes=_ALLOWED_ATTRS,
            strip=True,
        )

    # Nothing to clean: hand the falsy value back untouched.
    return content


def linkify_urls(html):
    """
    Auto-link bare http(s) URLs in HTML content.

    URLs that appear in text between tags are wrapped in an ``<a>`` element
    that opens in a new tab. URLs are left untouched when they are:

    - inside an existing ``<a>...</a>`` element (no nested links), or
    - part of a tag itself (e.g. ``href``/``src`` attribute values), because
      tags are copied to the output verbatim.

    ``<img>`` is a void element with no closing tag, so it must not switch
    the "inside a link" state on; otherwise every URL after the first image
    would be skipped.

    Args:
        html: HTML string. The result is wrapped in ``Markup`` and therefore
            not escaped again by templates, so pass only HTML that has
            already been sanitized.

    Returns:
        ``Markup`` with bare URLs converted to links, or ``html`` itself
        when it is falsy (``None``, empty string)
    """
    if not html:
        return html

    from markupsafe import Markup

    # The URL pattern excludes whitespace, angle brackets and quotes, so a
    # matched URL cannot break out of the href="..." attribute it is put in.
    url_pattern = re.compile(r'(https?://[^\s<>"\']+)')
    tag_pattern = re.compile(r'<(/?)(\w+)([^>]*)>')
    anchor_template = (
        '<a href="{0}" target="_blank" '
        'style="color:var(--primary);word-break:break-all;">{0}</a>'
    )

    def _to_anchor(url_match):
        # Build the replacement <a> element for one matched URL.
        return anchor_template.format(url_match.group(0))

    def _linkify_text(text):
        # Replace every bare URL in a run of plain text.
        return url_pattern.sub(_to_anchor, text)

    parts = []
    pos = 0
    inside_anchor = False

    for match in tag_pattern.finditer(html):
        start, end = match.span()

        # Text between the previous tag and this one.
        if start > pos:
            chunk = html[pos:start]
            parts.append(chunk if inside_anchor else _linkify_text(chunk))

        # Tags (including their attribute values) are copied verbatim.
        parts.append(match.group(0))
        pos = end

        # Only <a> has content that must be protected from re-linking.
        if match.group(2).lower() == 'a':
            inside_anchor = match.group(1) != '/'

    # Text after the last tag (or the whole input when it has no tags).
    if pos < len(html):
        tail = html[pos:]
        parts.append(tail if inside_anchor else _linkify_text(tail))

    return Markup(''.join(parts))


def sanitize_input(text, max_length=1000):
    """
    Normalize a piece of user input.

    Null bytes are dropped, the result is cut to ``max_length`` characters,
    and surrounding whitespace is removed - in that order.

    Args:
        text: Input string to sanitize
        max_length: Maximum allowed length (default 1000)

    Returns:
        Sanitized string; an empty string when ``text`` is falsy
    """
    if text:
        without_nulls = text.replace('\x00', '')
        # Truncation happens before stripping, so leading whitespace counts
        # toward max_length.
        return without_nulls[:max_length].strip()

    return ""


def validate_email(email):
    """
    Validate email format.

    The whole string must match the pattern; in particular a trailing
    newline (e.g. ``"user@example.com\\n"``) is rejected.

    Args:
        email: Email address to validate

    Returns:
        bool: True if valid, False otherwise (including for non-string,
        empty, or longer-than-255-character input)
    """
    if not isinstance(email, str) or not email or len(email) > 255:
        return False

    # Simplified address pattern (not the full RFC 5322 grammar):
    # local part, "@", domain labels, and a TLD of at least two letters.
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'

    # fullmatch instead of match + "$": "$" also matches just before a
    # trailing newline, which would let "addr\n" pass validation.
    return re.fullmatch(pattern, email) is not None


def validate_password(password):
    """
    Validate password strength.

    Requirements:
    - Minimum 8 characters
    - At least one uppercase letter
    - At least one lowercase letter
    - At least one digit

    Letter checks are Unicode-aware, so non-ASCII letters (e.g. Polish
    "Ł" / "ż") count as uppercase / lowercase.

    Args:
        password: Password to validate

    Returns:
        tuple: (is_valid: bool, message: str)
    """
    if not password or len(password) < 8:
        return False, "Hasło musi mieć minimum 8 znaków"

    # str.isupper()/islower() instead of [A-Z]/[a-z]: the ASCII classes
    # rejected passwords whose only capital/small letters are non-ASCII.
    if not any(ch.isupper() for ch in password):
        return False, "Hasło musi zawierać przynajmniej jedną wielką literę"

    if not any(ch.islower() for ch in password):
        return False, "Hasło musi zawierać przynajmniej jedną małą literę"

    if not re.search(r'\d', password):
        return False, "Hasło musi zawierać przynajmniej jedną cyfrę"

    return True, "OK"


def ensure_url(url):
    """
    Ensure URL has http:// or https:// scheme.

    Surrounding whitespace is removed first. The scheme check is
    case-insensitive, and protocol-relative URLs (``//host/path``) get
    ``https:`` prepended rather than a second pair of slashes.

    Args:
        url: URL string

    Returns:
        URL with https:// prefix if no scheme present; ``url`` itself when
        it is falsy (``None``, empty string)
    """
    if not url:
        return url

    candidate = url.strip()
    if not candidate:
        # Whitespace-only input: there is nothing to prefix.
        return candidate

    # Schemes are case-insensitive, so "HTTP://..." already has one.
    if candidate.lower().startswith(('http://', 'https://')):
        return candidate

    # Protocol-relative URL: only the scheme name is missing.
    if candidate.startswith('//'):
        return f'https:{candidate}'

    return f'https://{candidate}'