133 lines
3.9 KiB
Python
133 lines
3.9 KiB
Python
# Copyright (c) 2023-present Plane Software, Inc. and contributors
|
|
# SPDX-License-Identifier: AGPL-3.0-only
|
|
# See the LICENSE file for details.
|
|
|
|
# Python imports
|
|
import re
|
|
from typing import Optional
|
|
from urllib.parse import urlparse, urlunparse
|
|
|
|
# Pre-compiled URL-detection pattern — compiled once at import time.
# Case-insensitively matches any of:
#   * explicit http:// or https:// URLs,
#   * "www."-prefixed host names,
#   * bare dotted domain names ending in a 2-6 letter TLD,
#   * dotted-quad IPv4 addresses.
# The bounded quantifiers ({0,61}, per-label DNS limit) keep backtracking
# shallow, which limits ReDoS exposure.
URL_PATTERN = re.compile(
    r"(?i)"  # Case insensitive
    r"(?:"  # Non-capturing group for alternatives
    r"https?://[^\s]+"  # scheme-prefixed URL up to the next whitespace
    r"|"
    r"www\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*"  # noqa: E501
    r"|"
    r"(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}"  # noqa: E501
    r"|"
    r"(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"  # noqa: E501
    r")"
)


def contains_url(value: str) -> bool:
    """
    Return True if *value* appears to contain a URL.

    Work is bounded to guard against ReDoS-style slowdowns:
    1. Inputs longer than 1000 characters are rejected outright
       (the function then returns False regardless of content).
    2. Only the first 500 characters of each line are scanned.
    3. Matching uses the pre-compiled, bounded-quantifier URL_PATTERN.

    Args:
        value (str): The input string to check for URLs.

    Returns:
        bool: True if the string contains a URL, False otherwise.
    """
    # Hard cap on total input size — longer strings are not scanned at all.
    if len(value) > 1000:  # Reasonable limit for URL detection
        return False

    # Scan per line; slicing to 500 chars bounds the work for each search.
    return any(
        URL_PATTERN.search(line[:500]) for line in value.split("\n")
    )
|
|
|
|
|
|
def is_valid_url(url: str) -> bool:
    """
    Validate whether the given string is a well-formed URL.

    Args:
        url (str): The URL string to validate.

    Returns:
        bool: True if the URL is valid, False otherwise.

    Example:
        >>> is_valid_url("https://example.com")
        True
        >>> is_valid_url("not a url")
        False
    """
    try:
        result = urlparse(url)
    except (TypeError, ValueError):
        # urlparse raises TypeError for non-string input and ValueError for
        # malformed input such as an unclosed IPv6 bracket ("http://[::1").
        # The original code caught only TypeError, letting ValueError escape.
        return False
    # A valid URL should have at least scheme and netloc
    return all([result.scheme, result.netloc])
|
|
|
|
|
|
def get_url_components(url: str) -> Optional[dict]:
    """
    Parse *url* and return its components, or None when the URL is invalid.

    Args:
        url (str): The URL string to parse.

    Returns:
        Optional[dict]: A dictionary with URL components if valid, None otherwise.

    Example:
        >>> get_url_components("https://example.com/path?query=1")
        {
        'scheme': 'https', 'netloc': 'example.com',
        'path': '/path', 'params': '',
        'query': 'query=1', 'fragment': ''}
    """
    # Reject anything that is not a well-formed URL up front.
    if not is_valid_url(url):
        return None
    parsed = urlparse(url)
    # ParseResult is a named tuple; lift each field into a plain dict.
    fields = ("scheme", "netloc", "path", "params", "query", "fragment")
    return {field: getattr(parsed, field) for field in fields}
|
|
|
|
|
|
def normalize_url_path(url: str) -> str:
    """
    Collapse runs of consecutive slashes in a URL's path to single slashes.

    The scheme, host, query string, and fragment are left untouched; only
    the path component is rewritten.

    Args:
        url (str): The input URL string to normalize.

    Returns:
        str: The normalized URL with redundant slashes in the path removed.

    Example:
        >>> normalize_url_path('https://example.com//foo///bar//baz?x=1#frag')
        'https://example.com/foo/bar/baz?x=1#frag'
    """
    components = urlparse(url)
    # Collapse every run of '/' characters in the path down to one.
    collapsed = re.sub(r"/+", "/", components.path)
    # Rebuild the URL with the cleaned-up path swapped in; ParseResult is a
    # named tuple, so _replace produces a copy with only the path changed.
    return urlunparse(components._replace(path=collapsed))
|