133 lines
3.9 KiB
Python
133 lines
3.9 KiB
Python
# Copyright (c) 2023-present Plane Software, Inc. and contributors
|
|
# SPDX-License-Identifier: AGPL-3.0-only
|
|
# See the LICENSE file for details.
|
|
|
|
# Python imports
|
|
import re
|
|
from typing import Optional
|
|
from urllib.parse import urlparse, urlunparse
|
|
|
|
# Pre-compiled URL-detection pattern — compiled once at import time.
# Case-insensitively matches any of:
#   * explicit http:// or https:// URLs,
#   * "www."-prefixed host names,
#   * bare dotted domain names ending in a 2-6 letter TLD,
#   * dotted-quad IPv4 addresses.
# The bounded quantifiers ({0,61}, per-label DNS limit) keep backtracking
# shallow, which limits ReDoS exposure.
URL_PATTERN = re.compile(
    r"(?i)"  # Case insensitive
    r"(?:"  # Non-capturing group for alternatives
    r"https?://[^\s]+"  # scheme-prefixed URL up to the next whitespace
    r"|"
    r"www\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*"  # noqa: E501
    r"|"
    r"(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}"  # noqa: E501
    r"|"
    r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"  # noqa: E501
    r")"
)


def contains_url(value: str) -> bool:
    """
    Return True if *value* appears to contain a URL.

    Work is bounded to guard against ReDoS-style slowdowns:
    1. Inputs longer than 1000 characters are rejected outright
       (the function then returns False regardless of content).
    2. Only the first 500 characters of each line are scanned.
    3. Matching uses the pre-compiled, bounded-quantifier URL_PATTERN.

    Args:
        value (str): The input string to check for URLs.

    Returns:
        bool: True if the string contains a URL, False otherwise.
    """
    # Hard cap on total input size — longer strings are not scanned at all.
    if len(value) > 1000:  # Reasonable limit for URL detection
        return False

    # Scan per line; slicing to 500 chars bounds the work for each search.
    return any(
        URL_PATTERN.search(line[:500]) for line in value.split("\n")
    )
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """
    Validate whether the given string is a well-formed URL.

    Args:
        url (str): The URL string to validate.

    Returns:
        bool: True if the URL is valid, False otherwise.

    Example:
        >>> is_valid_url("https://example.com")
        True
        >>> is_valid_url("not a url")
        False
    """
    try:
        result = urlparse(url)
    except (TypeError, ValueError):
        # urlparse raises TypeError for non-string input and ValueError for
        # malformed input such as an unclosed IPv6 bracket ("http://[::1").
        # The original code caught only TypeError, letting ValueError escape.
        return False
    # A valid URL should have at least scheme and netloc
    return all([result.scheme, result.netloc])
|
|
|
|
|
|
def get_url_components(url: str) -> Optional[dict]:
    """
    Parse *url* and return its components, or None when the URL is invalid.

    Args:
        url (str): The URL string to parse.

    Returns:
        Optional[dict]: A dictionary with URL components if valid, None otherwise.

    Example:
        >>> get_url_components("https://example.com/path?query=1")
        {
        'scheme': 'https', 'netloc': 'example.com',
        'path': '/path', 'params': '',
        'query': 'query=1', 'fragment': ''}
    """
    # Reject anything that is not a well-formed URL up front.
    if not is_valid_url(url):
        return None
    parsed = urlparse(url)
    # ParseResult is a named tuple; lift each field into a plain dict.
    fields = ("scheme", "netloc", "path", "params", "query", "fragment")
    return {field: getattr(parsed, field) for field in fields}
|
|
|
|
|
|
def normalize_url_path(url: str) -> str:
    """
    Collapse runs of consecutive slashes in a URL's path to single slashes.

    The scheme, host, query string, and fragment are left untouched; only
    the path component is rewritten.

    Args:
        url (str): The input URL string to normalize.

    Returns:
        str: The normalized URL with redundant slashes in the path removed.

    Example:
        >>> normalize_url_path('https://example.com//foo///bar//baz?x=1#frag')
        'https://example.com/foo/bar/baz?x=1#frag'
    """
    components = urlparse(url)
    # Collapse every run of '/' characters in the path down to one.
    collapsed = re.sub(r"/+", "/", components.path)
    # Rebuild the URL with the cleaned-up path swapped in; ParseResult is a
    # named tuple, so _replace produces a copy with only the path changed.
    return urlunparse(components._replace(path=collapsed))
|