# Python imports
import base64

import nh3

from plane.utils.exception_logger import log_exception

# Maximum allowed size for binary data (10MB)
MAX_SIZE = 10 * 1024 * 1024

# Suspicious patterns for binary data content.
# Entries are compared against lower-cased text, so they must be lower-case.
# NOTE(review): the entries of this list were lost when the file was
# flattened (everything between a "<" and the following ">" was stripped).
# The values below are a reconstruction - confirm against version control.
SUSPICIOUS_BINARY_PATTERNS = [
    "<html",
    "<!doctype",
    "<script",
    "javascript:",
    "data:",
    "<iframe",
]


def validate_binary_data(data):
    """
    Check that binary document data is within limits and not disguised markup.

    NOTE(review): the ``def`` line, the empty-input guard and the base64
    handling of this function were lost together with the pattern list above
    and have been reconstructed; everything from the size check onwards is
    taken from the surviving code. Confirm the name, the parameter and the
    reconstructed steps against version control.

    Args:
        data: Raw bytes, or (presumably) a base64-encoded string of them.

    Returns:
        tuple: (is_valid, error_message). ``error_message`` is ``None`` when
        ``is_valid`` is ``True``.
    """
    # --- reconstructed section (see NOTE above) ---------------------------
    if not data:
        return True, None  # Nothing to validate

    if isinstance(data, str):
        try:
            binary_data = base64.b64decode(data)
        except ValueError:
            # binascii.Error (bad padding/characters) is a ValueError subclass
            return False, "Invalid base64 encoding"
    else:
        binary_data = data
    # --- end of reconstructed section -------------------------------------

    # Size check - 10MB limit
    if len(binary_data) > MAX_SIZE:
        return False, "Binary data exceeds maximum size limit (10MB)"

    # Basic format validation
    if len(binary_data) < 4:
        return False, "Binary data too short to be valid document format"

    # Check for suspicious text patterns (HTML/JS) in the first 200 decoded
    # characters. Lower-case once instead of once per pattern.
    try:
        leading_text = binary_data.decode("utf-8", errors="ignore")[:200].lower()
    except Exception:
        # Deliberately best-effort: data that cannot be viewed as text at all
        # cannot contain the textual patterns, so it passes this check.
        return True, None

    if any(pattern in leading_text for pattern in SUSPICIOUS_BINARY_PATTERNS):
        return False, "Binary data contains suspicious content patterns"

    return True, None


def validate_html_content(html_content: str):
    """
    Sanitize HTML content using nh3.

    Args:
        html_content: The HTML string to sanitize.

    Returns:
        tuple: (is_valid, error_message, clean_html)
            - Empty or ``None`` input: ``(True, None, None)``.
            - Input larger than ``MAX_SIZE`` bytes when UTF-8 encoded:
              ``(False, <message>, None)``.
            - Otherwise ``(True, None, <output of nh3.clean>)``, or
              ``(False, "Failed to sanitize HTML", None)`` if sanitizing
              raised; the exception is passed to ``log_exception``.
    """
    if not html_content:
        return True, None, None

    # Size check - 10MB limit (consistent with binary validation).
    # Measured in encoded bytes, not characters.
    if len(html_content.encode("utf-8")) > MAX_SIZE:
        return False, "HTML content exceeds maximum size limit (10MB)", None

    try:
        clean_html = nh3.clean(html_content)
        return True, None, clean_html
    except Exception as e:
        # Boundary handler: never let a sanitizer failure propagate to callers
        log_exception(e)
        return False, "Failed to sanitize HTML", None