[WIKI-553] chore: improved pages components tracking (#7966)
* chore: page components tracking
* chore: changed the transaction task
* chore: added logger for description html
This commit is contained in: parent 5fa9943b66 · commit 68aa2fe0b8
3 changed files with 129 additions and 53 deletions
@@ -137,7 +137,11 @@ class PageViewSet(BaseViewSet):
         if serializer.is_valid():
             serializer.save()
             # capture the page transaction
-            page_transaction.delay(request.data, None, serializer.data["id"])
+            page_transaction.delay(
+                new_description_html=request.data.get("description_html", "<p></p>"),
+                old_description_html=None,
+                page_id=serializer.data["id"],
+            )
             page = self.get_queryset().get(pk=serializer.data["id"])
             serializer = PageDetailSerializer(page)
             return Response(serializer.data, status=status.HTTP_201_CREATED)
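
The same shape change repeats at every call site in this commit: instead of handing the task the whole request payload positionally, the views now pass only the description HTML strings as keyword arguments. A condensed before/after of the pattern, lifted from the hunks in this file:

    # before: positional arguments, whole request payload
    page_transaction.delay(request.data, None, serializer.data["id"])

    # after: keyword arguments, only the description HTML
    page_transaction.delay(
        new_description_html=request.data.get("description_html", "<p></p>"),
        old_description_html=None,
        page_id=serializer.data["id"],
    )
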
@@ -168,11 +172,8 @@ class PageViewSet(BaseViewSet):
             # capture the page transaction
             if request.data.get("description_html"):
                 page_transaction.delay(
-                    new_value=request.data,
-                    old_value=json.dumps(
-                        {"description_html": page_description},
-                        cls=DjangoJSONEncoder,
-                    ),
+                    new_description_html=request.data.get("description_html", "<p></p>"),
+                    old_description_html=page_description,
                     page_id=page_id,
                 )
@@ -504,7 +505,11 @@ class PagesDescriptionViewSet(BaseViewSet):
         if serializer.is_valid():
             # Capture the page transaction
             if request.data.get("description_html"):
-                page_transaction.delay(new_value=request.data, old_value=existing_instance, page_id=page_id)
+                page_transaction.delay(
+                    new_description_html=request.data.get("description_html", "<p></p>"),
+                    old_description_html=page.description_html,
+                    page_id=page_id,
+                )
 
             # Update the page using serializer
             updated_page = serializer.save()
@@ -550,7 +555,11 @@ class PageDuplicateEndpoint(BaseAPIView):
             updated_by_id=page.updated_by_id,
         )
 
-        page_transaction.delay({"description_html": page.description_html}, None, page.id)
+        page_transaction.delay(
+            new_description_html=page.description_html,
+            old_description_html=None,
+            page_id=page.id,
+        )
 
         # Copy the s3 objects uploaded in the page
         copy_s3_objects_of_description_and_assets.delay(
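
All four call sites now target the same keyword signature, which the task refactor further down introduces. For quick reference, a stub of the new entry point (the full body appears in the task hunk below):

    @shared_task
    def page_transaction(new_description_html, old_description_html, page_id):
        # extracts tracked components from both HTML strings and reconciles
        # the corresponding PageLog rows (see the task hunk below)
        ...
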
@@ -1,5 +1,5 @@
 # Python imports
-import json
+import logging
 
 # Django imports
 from django.utils import timezone
@@ -7,72 +7,134 @@ from django.utils import timezone
 # Third-party imports
 from bs4 import BeautifulSoup
 
-# Module imports
-from plane.db.models import Page, PageLog
+# App imports
 from celery import shared_task
+from plane.db.models import Page, PageLog
 from plane.utils.exception_logger import log_exception
 
+logger = logging.getLogger("plane.worker")
 
-def extract_components(value, tag):
-    try:
-        mentions = []
-        html = value.get("description_html")
-        soup = BeautifulSoup(html, "html.parser")
-        mention_tags = soup.find_all(tag)
-
-        for mention_tag in mention_tags:
-            mention = {
-                "id": mention_tag.get("id"),
-                "entity_identifier": mention_tag.get("entity_identifier"),
-                "entity_name": mention_tag.get("entity_name"),
-            }
-            mentions.append(mention)
-
-        return mentions
-    except Exception:
-        return []
+
+COMPONENT_MAP = {
+    "mention-component": {
+        "attributes": ["id", "entity_identifier", "entity_name", "entity_type"],
+        "extract": lambda m: {
+            "entity_name": m.get("entity_name"),
+            "entity_type": None,
+            "entity_identifier": m.get("entity_identifier"),
+        },
+    },
+    "image-component": {
+        "attributes": ["id", "src"],
+        "extract": lambda m: {
+            "entity_name": "image",
+            "entity_type": None,
+            "entity_identifier": m.get("src"),
+        },
+    },
+}
+
+component_map = {
+    **COMPONENT_MAP,
+}
+
+
+def extract_all_components(description_html):
+    """
+    Extracts all component types from the HTML value in a single pass.
+    Returns a dict mapping component_type -> list of extracted entities.
+    """
+    try:
+        if not description_html:
+            return {component: [] for component in component_map.keys()}
+
+        soup = BeautifulSoup(description_html, "html.parser")
+        results = {}
+
+        for component, config in component_map.items():
+            attributes = config.get("attributes", ["id"])
+            component_tags = soup.find_all(component)
+
+            entities = []
+            for tag in component_tags:
+                entity = {attr: tag.get(attr) for attr in attributes}
+                entities.append(entity)
+
+            results[component] = entities
+
+        return results
+
+    except Exception:
+        return {component: [] for component in component_map.keys()}
+
+
+def get_entity_details(component: str, mention: dict):
+    """
+    Normalizes mention attributes into entity_name, entity_type, entity_identifier.
+    """
+    config = component_map.get(component)
+    if not config:
+        return {"entity_name": None, "entity_type": None, "entity_identifier": None}
+    return config["extract"](mention)
 
 
 @shared_task
-def page_transaction(new_value, old_value, page_id):
+def page_transaction(new_description_html, old_description_html, page_id):
+    """
+    Tracks changes in page content (mentions, embeds, etc.)
+    and logs them in PageLog for audit and reference.
+    """
     try:
         page = Page.objects.get(pk=page_id)
-        new_page_mention = PageLog.objects.filter(page_id=page_id).exists()
 
-        old_value = json.loads(old_value) if old_value else {}
+        has_existing_logs = PageLog.objects.filter(page_id=page_id).exists()
+
+        # Extract all components in a single pass (optimized)
+        old_components = extract_all_components(old_description_html)
+        new_components = extract_all_components(new_description_html)
 
         new_transactions = []
         deleted_transaction_ids = set()
 
-        # TODO - Add "issue-embed-component", "img", "todo" components
-        components = ["mention-component"]
-        for component in components:
-            old_mentions = extract_components(old_value, component)
-            new_mentions = extract_components(new_value, component)
+        for component in component_map.keys():
+            old_entities = old_components[component]
+            new_entities = new_components[component]
 
-            new_mentions_ids = {mention["id"] for mention in new_mentions}
-            old_mention_ids = {mention["id"] for mention in old_mentions}
-            deleted_transaction_ids.update(old_mention_ids - new_mentions_ids)
+            old_ids = {m.get("id") for m in old_entities if m.get("id")}
+            new_ids = {m.get("id") for m in new_entities if m.get("id")}
+            deleted_transaction_ids.update(old_ids - new_ids)
 
-            new_transactions.extend(
-                PageLog(
-                    transaction=mention["id"],
-                    page_id=page_id,
-                    entity_identifier=mention["entity_identifier"],
-                    entity_name=mention["entity_name"],
-                    workspace_id=page.workspace_id,
-                    created_at=timezone.now(),
-                    updated_at=timezone.now(),
-                )
-                for mention in new_mentions
-                if mention["id"] not in old_mention_ids or not new_page_mention
-            )
+            for mention in new_entities:
+                mention_id = mention.get("id")
+                if not mention_id or (mention_id in old_ids and has_existing_logs):
+                    continue
+
+                details = get_entity_details(component, mention)
+                current_time = timezone.now()
+
+                new_transactions.append(
+                    PageLog(
+                        transaction=mention_id,
+                        page_id=page_id,
+                        entity_identifier=details["entity_identifier"],
+                        entity_name=details["entity_name"],
+                        entity_type=details["entity_type"],
+                        workspace_id=page.workspace_id,
+                        created_at=current_time,
+                        updated_at=current_time,
+                    )
+                )
 
-        # Create new PageLog objects for new transactions
-        PageLog.objects.bulk_create(new_transactions, batch_size=10, ignore_conflicts=True)
+        # Bulk insert and cleanup
+        if new_transactions:
+            PageLog.objects.bulk_create(
+                new_transactions, batch_size=50, ignore_conflicts=True
+            )
 
-        # Delete the removed transactions
-        PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete()
+        if deleted_transaction_ids:
+            PageLog.objects.filter(transaction__in=deleted_transaction_ids).delete()
+
     except Page.DoesNotExist:
         return
     except Exception as e:
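
To make the refactored extraction concrete, here is a small standalone sketch of the same idea. The component names and attribute lists are taken from the diff above; the sample HTML, the simplified COMPONENT_MAP (without the extract lambdas), and the prints are invented purely for illustration and are not part of the commit:

    from bs4 import BeautifulSoup

    # Simplified copy of the component registry from the diff (extract lambdas omitted).
    COMPONENT_MAP = {
        "mention-component": {"attributes": ["id", "entity_identifier", "entity_name", "entity_type"]},
        "image-component": {"attributes": ["id", "src"]},
    }

    def extract_all_components(description_html):
        # Parse the document once and collect every tracked component type.
        if not description_html:
            return {component: [] for component in COMPONENT_MAP}
        soup = BeautifulSoup(description_html, "html.parser")
        return {
            component: [
                {attr: tag.get(attr) for attr in config["attributes"]}
                for tag in soup.find_all(component)
            ]
            for component, config in COMPONENT_MAP.items()
        }

    old_html = '<p><mention-component id="m1" entity_identifier="u1" entity_name="user_mention"></mention-component></p>'
    new_html = '<p><mention-component id="m2" entity_identifier="u2" entity_name="user_mention"></mention-component></p>'

    old_components = extract_all_components(old_html)
    new_components = extract_all_components(new_html)

    # Same reconciliation idea as the task: ids that disappeared are deleted PageLog
    # transactions, ids that appeared become new PageLog rows.
    old_ids = {m.get("id") for m in old_components["mention-component"] if m.get("id")}
    new_ids = {m.get("id") for m in new_components["mention-component"] if m.get("id")}
    print("deleted transactions:", old_ids - new_ids)  # {'m1'}
    print("new transactions:", new_ids - old_ids)      # {'m2'}

Parsing the document once and diffing component ids is what lets a single task invocation handle every tracked component type, instead of re-parsing the HTML per component as the old extract_components helper did.
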
@@ -4,7 +4,9 @@ import nh3
 from plane.utils.exception_logger import log_exception
 from bs4 import BeautifulSoup
 from collections import defaultdict
+import logging
 
+logger = logging.getLogger("plane.api")
 
 # Maximum allowed size for binary data (10MB)
 MAX_SIZE = 10 * 1024 * 1024
@@ -54,7 +56,9 @@ def validate_binary_data(data):
     # Check for suspicious text patterns (HTML/JS)
     try:
         decoded_text = binary_data.decode("utf-8", errors="ignore")[:200]
-        if any(pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS):
+        if any(
+            pattern in decoded_text.lower() for pattern in SUSPICIOUS_BINARY_PATTERNS
+        ):
             return False, "Binary data contains suspicious content patterns"
     except Exception:
         pass  # Binary data might not be decodable as text, which is fine
@@ -232,8 +236,9 @@ def validate_html_content(html_content: str):
             summary = json.dumps(diff)
         except Exception:
             summary = str(diff)
+        logger.warning(f"HTML sanitization removals: {summary}")
         log_exception(
-            f"HTML sanitization removals: {summary}",
+            ValueError(f"HTML sanitization removals: {summary}"),
             warning=True,
         )
     return True, None, clean_html
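
The sanitization report now goes to a named module logger in addition to log_exception, and log_exception receives an exception instance rather than a bare string. A minimal sketch of the new reporting path; the logger name comes from the diff, while log_exception (imported in the real module from plane.utils.exception_logger) is passed in here only to keep the snippet self-contained, and its internal behavior is assumed rather than shown:

    import logging

    logger = logging.getLogger("plane.api")

    def report_removals(diff, log_exception):
        # Mirrors the inline change: warn locally, then hand an exception *object*
        # to the project's exception reporter instead of a plain string.
        summary = str(diff)
        logger.warning(f"HTML sanitization removals: {summary}")
        log_exception(ValueError(f"HTML sanitization removals: {summary}"), warning=True)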