[WEB-4780] chore: changed the html validation (#7648)

* chore: changed the html validation * chore: added requirements for nh3 * chore: removed the json validations
2025-08-27 00:38:25 +05:30 · 2025-08-27 00:38:25 +05:30 · 0af75897f5
commit 0af75897f5
parent 3602ff6930
10 changed files with 91 additions and 366 deletions
--- a/apps/api/plane/utils/content_validator.py
+++ b/apps/api/plane/utils/content_validator.py
@ -1,36 +1,11 @@
 # Python imports
 import base64
-import json
-import re
-
+import nh3
+from plane.utils.exception_logger import log_exception

 # Maximum allowed size for binary data (10MB)
 MAX_SIZE = 10 * 1024 * 1024

-# Maximum recursion depth to prevent stack overflow
-MAX_RECURSION_DEPTH = 20
-
-# Dangerous text patterns that could indicate XSS or script injection
-DANGEROUS_TEXT_PATTERNS = [
-    r"<script[^>]*>.*?</script>",
-    r"javascript\s*:",
-    r"data\s*:\s*text/html",
-    r"eval\s*\(",
-    r"document\s*\.",
-    r"window\s*\.",
-    r"location\s*\.",
-]
-
-# Dangerous attribute patterns for HTML attributes
-DANGEROUS_ATTR_PATTERNS = [
-    r"javascript\s*:",
-    r"data\s*:\s*text/html",
-    r"eval\s*\(",
-    r"alert\s*\(",
-    r"document\s*\.",
-    r"window\s*\.",
-]
-
 # Suspicious patterns for binary data content
 SUSPICIOUS_BINARY_PATTERNS = [
    "<html",
@ -41,70 +16,6 @@ SUSPICIOUS_BINARY_PATTERNS = [
    "<iframe",
 ]

-# Malicious HTML patterns for content validation
-MALICIOUS_HTML_PATTERNS = [
-    # Script tags with any content
-    r"<script[^>]*>",
-    r"</script>",
-    # JavaScript URLs in various attributes
-    r'(?:href|src|action)\s*=\s*["\']?\s*javascript:',
-    # Data URLs with text/html (potential XSS)
-    r'(?:href|src|action)\s*=\s*["\']?\s*data:text/html',
-    # Dangerous event handlers with JavaScript-like content
-    r'on(?:load|error|click|focus|blur|change|submit|reset|select|resize|scroll|unload|beforeunload|hashchange|popstate|storage|message|offline|online)\s*=\s*["\']?[^"\']*(?:javascript|alert|eval|document\.|window\.|location\.|history\.)[^"\']*["\']?',
-    # Object and embed tags that could load external content
-    r"<(?:object|embed)[^>]*(?:data|src)\s*=",
-    # Base tag that could change relative URL resolution
-    r"<base[^>]*href\s*=",
-    # Dangerous iframe sources
-    r'<iframe[^>]*src\s*=\s*["\']?(?:javascript:|data:text/html)',
-    # Meta refresh redirects
-    r'<meta[^>]*http-equiv\s*=\s*["\']?refresh["\']?',
-    # Link tags - simplified patterns
-    r'<link[^>]*rel\s*=\s*["\']?stylesheet["\']?',
-    r'<link[^>]*href\s*=\s*["\']?https?://',
-    r'<link[^>]*href\s*=\s*["\']?//',
-    r'<link[^>]*href\s*=\s*["\']?(?:data:|javascript:)',
-    # Style tags with external imports
-    r"<style[^>]*>.*?@import.*?(?:https?://|//)",
-    # Link tags with dangerous rel types
-    r'<link[^>]*rel\s*=\s*["\']?(?:import|preload|prefetch|dns-prefetch|preconnect)["\']?',
-    # Forms with action attributes
-    r"<form[^>]*action\s*=",
-]
-
-# Dangerous JavaScript patterns for event handlers
-DANGEROUS_JS_PATTERNS = [
-    r"alert\s*\(",
-    r"eval\s*\(",
-    r"document\s*\.",
-    r"window\s*\.",
-    r"location\s*\.",
-    r"fetch\s*\(",
-    r"XMLHttpRequest",
-    r"innerHTML\s*=",
-    r"outerHTML\s*=",
-    r"document\.write",
-    r"script\s*>",
-]
-
-# HTML self-closing tags that don't need closing tags
-SELF_CLOSING_TAGS = {
-    "img",
-    "br",
-    "hr",
-    "input",
-    "meta",
-    "link",
-    "area",
-    "base",
-    "col",
-    "embed",
-    "source",
-    "track",
-    "wbr",
-}
-

 def validate_binary_data(data):
    """
@ -149,191 +60,21 @@ def validate_binary_data(data):
    return True, None


-def validate_html_content(html_content):
+def validate_html_content(html_content: str):
    """
-    Validate that HTML content is safe and doesn't contain malicious patterns.
-
-    Args:
-        html_content (str): The HTML content to validate
-
-    Returns:
-        tuple: (is_valid: bool, error_message: str or None)
+    Sanitize HTML content using nh3.
+    Returns a tuple: (is_valid, error_message, clean_html)
    """
    if not html_content:
-        return True, None  # Empty is OK
+        return True, None, None

    # Size check - 10MB limit (consistent with binary validation)
    if len(html_content.encode("utf-8")) > MAX_SIZE:
-        return False, "HTML content exceeds maximum size limit (10MB)"
-
-    # Check for specific malicious patterns (simplified and more reliable)
-    for pattern in MALICIOUS_HTML_PATTERNS:
-        if re.search(pattern, html_content, re.IGNORECASE | re.DOTALL):
-            return (
-                False,
-                f"HTML content contains potentially malicious patterns: {pattern}",
-            )
-
-    # Additional check for inline event handlers that contain suspicious content
-    # This is more permissive - only blocks if the event handler contains actual dangerous code
-    event_handler_pattern = r'on\w+\s*=\s*["\']([^"\']*)["\']'
-    event_matches = re.findall(event_handler_pattern, html_content, re.IGNORECASE)
-
-    for handler_content in event_matches:
-        for js_pattern in DANGEROUS_JS_PATTERNS:
-            if re.search(js_pattern, handler_content, re.IGNORECASE):
-                return (
-                    False,
-                    f"HTML content contains dangerous JavaScript in event handler: {handler_content[:100]}",
-                )
-
-
-    return True, None
-
-
-def validate_json_content(json_content):
-    """
-    Validate that JSON content is safe and doesn't contain malicious patterns.
-
-    Args:
-        json_content (dict): The JSON content to validate
-
-    Returns:
-        tuple: (is_valid: bool, error_message: str or None)
-    """
-    if not json_content:
-        return True, None  # Empty is OK
+        return False, "HTML content exceeds maximum size limit (10MB)", None

    try:
-        # Size check - 10MB limit (consistent with other validations)
-        json_str = json.dumps(json_content)
-        if len(json_str.encode("utf-8")) > MAX_SIZE:
-            return False, "JSON content exceeds maximum size limit (10MB)"
-
-        # Basic structure validation for page description JSON
-        if isinstance(json_content, dict):
-            # Check for expected page description structure
-            # This is based on ProseMirror/Tiptap JSON structure
-            if "type" in json_content and json_content.get("type") == "doc":
-                # Valid document structure
-                if "content" in json_content and isinstance(
-                    json_content["content"], list
-                ):
-                    # Recursively check content for suspicious patterns
-                    is_valid, error_msg = _validate_json_content_array(
-                        json_content["content"]
-                    )
-                    if not is_valid:
-                        return False, error_msg
-            elif "type" not in json_content and "content" not in json_content:
-                # Allow other JSON structures but validate for suspicious content
-                is_valid, error_msg = _validate_json_content_recursive(json_content)
-                if not is_valid:
-                    return False, error_msg
-        else:
-            return False, "JSON description must be a valid object"
-
-    except (TypeError, ValueError) as e:
-        return False, "Invalid JSON structure"
+        clean_html = nh3.clean(html_content)
+        return True, None, clean_html
    except Exception as e:
-        return False, "Failed to validate JSON content"
-
-    return True, None
-
-
-def _validate_json_content_array(content, depth=0):
-    """
-    Validate JSON content array for suspicious patterns.
-
-    Args:
-        content (list): Array of content nodes to validate
-        depth (int): Current recursion depth (default: 0)
-
-    Returns:
-        tuple: (is_valid: bool, error_message: str or None)
-    """
-    # Check recursion depth to prevent stack overflow
-    if depth > MAX_RECURSION_DEPTH:
-        return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded"
-
-    if not isinstance(content, list):
-        return True, None
-
-    for node in content:
-        if isinstance(node, dict):
-            # Check text content for suspicious patterns (more targeted)
-            if node.get("type") == "text" and "text" in node:
-                text_content = node["text"]
-                for pattern in DANGEROUS_TEXT_PATTERNS:
-                    if re.search(pattern, text_content, re.IGNORECASE):
-                        return (
-                            False,
-                            "JSON content contains suspicious script patterns in text",
-                        )
-
-            # Check attributes for suspicious content (more targeted)
-            if "attrs" in node and isinstance(node["attrs"], dict):
-                for attr_name, attr_value in node["attrs"].items():
-                    if isinstance(attr_value, str):
-                        # Only check specific attributes that could be dangerous
-                        if attr_name.lower() in [
-                            "href",
-                            "src",
-                            "action",
-                            "onclick",
-                            "onload",
-                            "onerror",
-                        ]:
-                            for pattern in DANGEROUS_ATTR_PATTERNS:
-                                if re.search(pattern, attr_value, re.IGNORECASE):
-                                    return (
-                                        False,
-                                        f"JSON content contains dangerous pattern in {attr_name} attribute",
-                                    )
-
-            # Recursively check nested content
-            if "content" in node and isinstance(node["content"], list):
-                is_valid, error_msg = _validate_json_content_array(
-                    node["content"], depth + 1
-                )
-                if not is_valid:
-                    return False, error_msg
-
-    return True, None
-
-
-def _validate_json_content_recursive(obj, depth=0):
-    """
-    Recursively validate JSON object for suspicious content.
-
-    Args:
-        obj: JSON object (dict, list, or primitive) to validate
-        depth (int): Current recursion depth (default: 0)
-
-    Returns:
-        tuple: (is_valid: bool, error_message: str or None)
-    """
-    # Check recursion depth to prevent stack overflow
-    if depth > MAX_RECURSION_DEPTH:
-        return False, f"Maximum recursion depth ({MAX_RECURSION_DEPTH}) exceeded"
-    if isinstance(obj, dict):
-        for key, value in obj.items():
-            if isinstance(value, str):
-                # Check for dangerous patterns using module constants
-                for pattern in DANGEROUS_TEXT_PATTERNS:
-                    if re.search(pattern, value, re.IGNORECASE):
-                        return (
-                            False,
-                            "JSON content contains suspicious script patterns",
-                        )
-            elif isinstance(value, (dict, list)):
-                is_valid, error_msg = _validate_json_content_recursive(value, depth + 1)
-                if not is_valid:
-                    return False, error_msg
-    elif isinstance(obj, list):
-        for item in obj:
-            is_valid, error_msg = _validate_json_content_recursive(item, depth + 1)
-            if not is_valid:
-                return False, error_msg
-
-    return True, None
+        log_exception(e)
+        return False, "Failed to sanitize HTML", None