bb-plane-fork/apps/api/plane/utils/content_validator.py
Bavisetti Narayan 0af75897f5
[WEB-4780] chore: changed the html validation (#7648)
* chore: changed the html validation

* chore: added requirements for nh3

* chore: removed the json validations
2025-08-27 00:38:25 +05:30

80 lines
2.3 KiB
Python

# Python imports
import base64
import nh3
from plane.utils.exception_logger import log_exception
# Maximum allowed size for binary data (10MB)
MAX_SIZE = 10 * 1024 * 1024
# Suspicious patterns for binary data content
SUSPICIOUS_BINARY_PATTERNS = [
"<html",
"<!doctype",
"<script",
"javascript:",
"data:",
"<iframe",
]
def validate_binary_data(data):
"""
Validate that binary data appears to be valid document format and doesn't contain malicious content.
Args:
data (bytes or str): The binary data to validate, or base64-encoded string
Returns:
tuple: (is_valid: bool, error_message: str or None)
"""
if not data:
return True, None # Empty is OK
# Handle base64-encoded strings by decoding them first
if isinstance(data, str):
try:
binary_data = base64.b64decode(data)
except Exception:
return False, "Invalid base64 encoding"
else:
binary_data = data
# Size check - 10MB limit
if len(binary_data) > MAX_SIZE:
return False, "Binary data exceeds maximum size limit (10MB)"
# Basic format validation
if len(binary_data) < 4:
return False, "Binary data too short to be valid document format"
# Check for suspicious text patterns (HTML/JS)
try:
decoded_text = binary_data.decode("utf-8", errors="ignore")[:200]
if any(
pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS
):
return False, "Binary data contains suspicious content patterns"
except Exception:
pass # Binary data might not be decodable as text, which is fine
return True, None
def validate_html_content(html_content: str):
"""
Sanitize HTML content using nh3.
Returns a tuple: (is_valid, error_message, clean_html)
"""
if not html_content:
return True, None, None
# Size check - 10MB limit (consistent with binary validation)
if len(html_content.encode("utf-8")) > MAX_SIZE:
return False, "HTML content exceeds maximum size limit (10MB)", None
try:
clean_html = nh3.clean(html_content)
return True, None, clean_html
except Exception as e:
log_exception(e)
return False, "Failed to sanitize HTML", None