* chore: changed the html validation * chore: added requirements for nh3 * chore: removed the json validations
80 lines
2.3 KiB
Python
80 lines
2.3 KiB
Python
# Python imports
|
|
import base64
|
|
import nh3
|
|
from plane.utils.exception_logger import log_exception
|
|
|
|
# Maximum allowed size for binary data (10MB)
|
|
MAX_SIZE = 10 * 1024 * 1024
|
|
|
|
# Suspicious patterns for binary data content
|
|
SUSPICIOUS_BINARY_PATTERNS = [
|
|
"<html",
|
|
"<!doctype",
|
|
"<script",
|
|
"javascript:",
|
|
"data:",
|
|
"<iframe",
|
|
]
|
|
|
|
|
|
def validate_binary_data(data):
|
|
"""
|
|
Validate that binary data appears to be valid document format and doesn't contain malicious content.
|
|
|
|
Args:
|
|
data (bytes or str): The binary data to validate, or base64-encoded string
|
|
|
|
Returns:
|
|
tuple: (is_valid: bool, error_message: str or None)
|
|
"""
|
|
if not data:
|
|
return True, None # Empty is OK
|
|
|
|
# Handle base64-encoded strings by decoding them first
|
|
if isinstance(data, str):
|
|
try:
|
|
binary_data = base64.b64decode(data)
|
|
except Exception:
|
|
return False, "Invalid base64 encoding"
|
|
else:
|
|
binary_data = data
|
|
|
|
# Size check - 10MB limit
|
|
if len(binary_data) > MAX_SIZE:
|
|
return False, "Binary data exceeds maximum size limit (10MB)"
|
|
|
|
# Basic format validation
|
|
if len(binary_data) < 4:
|
|
return False, "Binary data too short to be valid document format"
|
|
|
|
# Check for suspicious text patterns (HTML/JS)
|
|
try:
|
|
decoded_text = binary_data.decode("utf-8", errors="ignore")[:200]
|
|
if any(
|
|
pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS
|
|
):
|
|
return False, "Binary data contains suspicious content patterns"
|
|
except Exception:
|
|
pass # Binary data might not be decodable as text, which is fine
|
|
|
|
return True, None
|
|
|
|
|
|
def validate_html_content(html_content: str):
|
|
"""
|
|
Sanitize HTML content using nh3.
|
|
Returns a tuple: (is_valid, error_message, clean_html)
|
|
"""
|
|
if not html_content:
|
|
return True, None, None
|
|
|
|
# Size check - 10MB limit (consistent with binary validation)
|
|
if len(html_content.encode("utf-8")) > MAX_SIZE:
|
|
return False, "HTML content exceeds maximum size limit (10MB)", None
|
|
|
|
try:
|
|
clean_html = nh3.clean(html_content)
|
|
return True, None, clean_html
|
|
except Exception as e:
|
|
log_exception(e)
|
|
return False, "Failed to sanitize HTML", None
|