# Python imports
import base64
import json
import re

# Maximum allowed size for binary data (10MB)
MAX_SIZE = 10 * 1024 * 1024

# Maximum recursion depth to prevent stack overflow
MAX_RECURSION_DEPTH = 20

# Dangerous text patterns that could indicate XSS or script injection
DANGEROUS_TEXT_PATTERNS = [
    r"<script[^>]*>.*?</script>",
    r"javascript\s*:",
    r"data\s*:\s*text/html",
    r"eval\s*\(",
    r"document\s*\.",
    r"window\s*\.",
    r"location\s*\.",
]

# Dangerous attribute patterns for HTML attributes
DANGEROUS_ATTR_PATTERNS = [
    r"javascript\s*:",
    r"data\s*:\s*text/html",
    r"eval\s*\(",
    r"alert\s*\(",
    r"document\s*\.",
    r"window\s*\.",
]

# Suspicious patterns for binary data content. These are plain substrings
# (not regexes) matched against the lowercased decoded text.
SUSPICIOUS_BINARY_PATTERNS = [
    "<script",
    "<iframe",
    "javascript:",
]

# Malicious HTML patterns for content validation
MALICIOUS_HTML_PATTERNS = [
    # Script tags (opening and closing)
    r"<script[^>]*>",
    r"</script>",
    # JavaScript URLs in various attributes
    r'(?:href|src|action)\s*=\s*["\']?\s*javascript:',
    # Data URLs with text/html (potential XSS)
    r'(?:href|src|action)\s*=\s*["\']?\s*data:text/html',
    # Dangerous event handlers with JavaScript-like content
    r'on(?:load|error|click|focus|blur|change|submit|reset|select|resize|scroll|unload|beforeunload|hashchange|popstate|storage|message|offline|online)\s*=\s*["\']?[^"\']*(?:javascript|alert|eval|document\.|window\.|location\.|history\.)[^"\']*["\']?',
    # Object and embed tags that could load external content
    r"<(?:object|embed)[^>]*(?:data|src)\s*=",
    # Base tag that could change relative URL resolution
    r"<base[^>]*href\s*=",
    # Dangerous iframe sources
    r'<iframe[^>]*src\s*=\s*["\']?(?:javascript:|data:text/html)',
    # Meta refresh redirects
    r'<meta[^>]*http-equiv\s*=\s*["\']?refresh["\']?',
    # Link tags - simplified patterns
    r'<link[^>]*rel\s*=\s*["\']?stylesheet["\']?',
    r'<link[^>]*href\s*=\s*["\']?https?://',
    r'<link[^>]*href\s*=\s*["\']?//',
    r'<link[^>]*href\s*=\s*["\']?(?:data:|javascript:)',
    # Style tags with external imports
    r"<style[^>]*>.*?@import.*?(?:https?://|//)",
    # Link tags with dangerous rel types
    r'<link[^>]*rel\s*=\s*["\']?(?:import|preload|prefetch|dns-prefetch|preconnect)["\']?',
    # Forms with action attributes
    r"<form[^>]*action\s*=",
]

# Dangerous JavaScript patterns for event handlers
DANGEROUS_JS_PATTERNS = [
    r"alert\s*\(",
    r"eval\s*\(",
    r"document\s*\.",
    r"window\s*\.",
    r"location\s*\.",
    r"fetch\s*\(",
    r"XMLHttpRequest",
    r"innerHTML\s*=",
    r"outerHTML\s*=",
    r"document\.write",
    r"script\s*>",
]

# HTML self-closing (void) tags that don't need closing tags
SELF_CLOSING_TAGS = {
    "img",
    "br",
    "hr",
    "input",
    "meta",
    "link",
    "area",
    "base",
    "col",
    "embed",
    "source",
    "track",
    "wbr",
}
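
# Illustrative sanity check (commented out so nothing runs at import time;
# the sample strings are hypothetical probes, not values from real documents):
# the regex lists above are applied with re.IGNORECASE, so an input like
# "JAVASCRIPT: alert(1)" should trip DANGEROUS_TEXT_PATTERNS while plain
# prose passes untouched.
#
#     assert any(re.search(p, "JAVASCRIPT: alert(1)", re.IGNORECASE)
#                for p in DANGEROUS_TEXT_PATTERNS)
#     assert not any(re.search(p, "An ordinary sentence.", re.IGNORECASE)
#                    for p in DANGEROUS_TEXT_PATTERNS)
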
def validate_binary_data(data):
    """
    Validate that binary data appears to be a valid document format and
    doesn't contain malicious content.

    Args:
        data (bytes or str): The binary data to validate, or a base64-encoded
            string

    Returns:
        tuple: (is_valid: bool, error_message: str or None)
    """
    if not data:
        return True, None  # Empty is OK

    # Handle base64-encoded strings by decoding them first
    if isinstance(data, str):
        try:
            binary_data = base64.b64decode(data)
        except Exception:
            return False, "Invalid base64 encoding"
    else:
        binary_data = data

    # Size check - 10MB limit
    if len(binary_data) > MAX_SIZE:
        return False, "Binary data exceeds maximum size limit (10MB)"

    # Basic format validation
    if len(binary_data) < 4:
        return False, "Binary data too short to be valid document format"

    # Check for suspicious text patterns (HTML/JS) in the first 200 decoded
    # characters
    try:
        decoded_text = binary_data.decode("utf-8", errors="ignore")[:200]
        if any(
            pattern in decoded_text.lower()
            for pattern in SUSPICIOUS_BINARY_PATTERNS
        ):
            return False, "Binary data contains suspicious content patterns"
    except Exception:
        pass  # Binary data might not be decodable as text, which is fine

    return True, None


def validate_html_content(html_content):
    """
    Validate that HTML content is safe and doesn't contain malicious patterns.

    Args:
        html_content (str): The HTML content to validate

    Returns:
        tuple: (is_valid: bool, error_message: str or None)
    """
    if not html_content:
        return True, None  # Empty is OK

    # Size check - 10MB limit (consistent with binary validation)
    if len(html_content.encode("utf-8")) > MAX_SIZE:
        return False, "HTML content exceeds maximum size limit (10MB)"

    # Check for specific malicious patterns (simplified and more reliable)
    for pattern in MALICIOUS_HTML_PATTERNS:
        if re.search(pattern, html_content, re.IGNORECASE | re.DOTALL):
            return (
                False,
                f"HTML content contains potentially malicious patterns: {pattern}",
            )

    # Additional check for inline event handlers that contain suspicious
    # content. This is more permissive - it only blocks if the event handler
    # contains actual dangerous code.
    event_handler_pattern = r'on\w+\s*=\s*["\']([^"\']*)["\']'
    event_matches = re.findall(event_handler_pattern, html_content, re.IGNORECASE)
    for handler_content in event_matches:
        for js_pattern in DANGEROUS_JS_PATTERNS:
            if re.search(js_pattern, handler_content, re.IGNORECASE):
                return (
                    False,
                    f"HTML content contains dangerous JavaScript in event handler: {handler_content[:100]}",
                )

    # Basic HTML structure validation - check for common malformed tags
    try:
        # Count opening and closing tags for basic structure validation
        opening_tags = re.findall(r"<(\w+)[^>]*>", html_content)
        closing_tags = re.findall(r"</(\w+)>", html_content)

        # Filter out self-closing tags from opening tags
        opening_tags_filtered = [
            tag for tag in opening_tags if tag.lower() not in SELF_CLOSING_TAGS
        ]

        # Basic check - if we have significantly more opening than closing
        # tags, the HTML might be malformed
        if len(opening_tags_filtered) > len(closing_tags) + 10:  # Allow some tolerance
            return False, "HTML content appears to be malformed (unmatched tags)"
    except Exception:
        pass  # If HTML parsing fails, we'll allow it

    return True, None
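
# Usage sketch (illustrative inputs only): every validator returns an
# (is_valid, error_message) tuple instead of raising, so callers branch on
# the boolean. Note that "onmouseover" is not in the stricter
# MALICIOUS_HTML_PATTERNS event list, so the second example falls through to
# the more permissive event-handler scan in validate_html_content.
#
#     ok, err = validate_html_content("<p>hello</p>")
#     # -> (True, None)
#     ok, err = validate_html_content('<p onmouseover="eval(x)">hi</p>')
#     # -> (False, "HTML content contains dangerous JavaScript in event handler: eval(x)")
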
def validate_json_content(json_content):
    """
    Validate that JSON content is safe and doesn't contain malicious patterns.

    Args:
        json_content (dict): The JSON content to validate

    Returns:
        tuple: (is_valid: bool, error_message: str or None)
    """
    if not json_content:
        return True, None  # Empty is OK

    try:
        # Size check - 10MB limit (consistent with other validations)
        json_str = json.dumps(json_content)
        if len(json_str.encode("utf-8")) > MAX_SIZE:
            return False, "JSON content exceeds maximum size limit (10MB)"

        # Basic structure validation for page description JSON
        if isinstance(json_content, dict):
            # Check for the expected page description structure, based on the
            # ProseMirror/Tiptap JSON document format
            if "type" in json_content and json_content.get("type") == "doc":
                # Valid document structure
                if "content" in json_content and isinstance(
                    json_content["content"], list
                ):
                    # Recursively check content for suspicious patterns
                    is_valid, error_msg = _validate_json_content_array(
                        json_content["content"]
                    )
                    if not is_valid:
                        return False, error_msg
            elif "type" not in json_content and "content" not in json_content:
                # Allow other JSON structures but validate for suspicious content
                is_valid, error_msg = _validate_json_content_recursive(json_content)
                if not is_valid:
                    return False, error_msg
        else:
            return False, "JSON description must be a valid object"
    except (TypeError, ValueError):
        return False, "Invalid JSON structure"
    except Exception:
        return False, "Failed to validate JSON content"

    return True, None


def _validate_json_content_array(content, depth=0):
    """
    Validate a JSON content array for suspicious patterns.

    Args:
        content (list): Array of content nodes to validate
        depth (int): Current recursion depth (default: 0)

    Returns:
        tuple: (is_valid: bool, error_message: str or None)
    """
    # Check recursion depth to prevent stack overflow
    if depth > MAX_RECURSION_DEPTH:
        return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded"

    if not isinstance(content, list):
        return True, None

    for node in content:
        if isinstance(node, dict):
            # Check text content for suspicious patterns (more targeted)
            if node.get("type") == "text" and "text" in node:
                text_content = node["text"]
                for pattern in DANGEROUS_TEXT_PATTERNS:
                    if re.search(pattern, text_content, re.IGNORECASE):
                        return (
                            False,
                            "JSON content contains suspicious script patterns in text",
                        )

            # Check attributes for suspicious content (more targeted)
            if "attrs" in node and isinstance(node["attrs"], dict):
                for attr_name, attr_value in node["attrs"].items():
                    if isinstance(attr_value, str):
                        # Only check specific attributes that could be dangerous
                        if attr_name.lower() in [
                            "href",
                            "src",
                            "action",
                            "onclick",
                            "onload",
                            "onerror",
                        ]:
                            for pattern in DANGEROUS_ATTR_PATTERNS:
                                if re.search(pattern, attr_value, re.IGNORECASE):
                                    return (
                                        False,
                                        f"JSON content contains dangerous pattern in {attr_name} attribute",
                                    )

            # Recursively check nested content
            if "content" in node and isinstance(node["content"], list):
                is_valid, error_msg = _validate_json_content_array(
                    node["content"], depth + 1
                )
                if not is_valid:
                    return False, error_msg

    return True, None
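
# Illustrative example (hypothetical document, following the ProseMirror/
# Tiptap "doc" shape that validate_json_content expects): a benign document
# passes, while a text node carrying a javascript: payload is rejected by
# _validate_json_content_array above.
#
#     doc = {"type": "doc", "content": [
#         {"type": "paragraph", "content": [{"type": "text", "text": "hello"}]},
#     ]}
#     validate_json_content(doc)   # -> (True, None)
#
#     doc["content"][0]["content"][0]["text"] = "javascript:alert(1)"
#     validate_json_content(doc)
#     # -> (False, "JSON content contains suspicious script patterns in text")
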
def _validate_json_content_recursive(obj, depth=0):
    """
    Recursively validate a JSON object for suspicious content.

    Args:
        obj: JSON object (dict, list, or primitive) to validate
        depth (int): Current recursion depth (default: 0)

    Returns:
        tuple: (is_valid: bool, error_message: str or None)
    """
    # Check recursion depth to prevent stack overflow
    if depth > MAX_RECURSION_DEPTH:
        return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded"

    if isinstance(obj, dict):
        for value in obj.values():
            if isinstance(value, str):
                # Check for dangerous patterns using the module constants
                for pattern in DANGEROUS_TEXT_PATTERNS:
                    if re.search(pattern, value, re.IGNORECASE):
                        return (
                            False,
                            "JSON content contains suspicious script patterns",
                        )
            elif isinstance(value, (dict, list)):
                is_valid, error_msg = _validate_json_content_recursive(
                    value, depth + 1
                )
                if not is_valid:
                    return False, error_msg
    elif isinstance(obj, list):
        for item in obj:
            is_valid, error_msg = _validate_json_content_recursive(item, depth + 1)
            if not is_valid:
                return False, error_msg

    return True, None
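
if __name__ == "__main__":
    # Minimal smoke test with illustrative inputs (assumed examples, not a
    # substitute for a real test suite): each call prints an
    # (is_valid, error_message) tuple.
    print(validate_binary_data(base64.b64encode(b"%PDF-1.7 sample").decode()))
    print(validate_html_content('<iframe src="javascript:alert(1)"></iframe>'))
    print(validate_json_content({"type": "doc", "content": []}))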