138 lines
4.3 KiB
Python
138 lines
4.3 KiB
Python
# Python imports
|
|
import logging
|
|
|
|
# Django imports
|
|
from django.utils import timezone
|
|
|
|
# Third-party imports
|
|
from bs4 import BeautifulSoup
|
|
|
|
# App imports
|
|
from celery import shared_task
|
|
from plane.db.models import Page, PageLog
|
|
from plane.utils.exception_logger import log_exception
|
|
|
|
# Module logger registered under the "plane.worker" name.
logger = logging.getLogger("plane.worker")


def _mention_details(m):
    """Build the normalized entity fields for a ``mention-component`` tag."""
    # NOTE(review): "entity_type" is listed in the collected attributes but is
    # always stored as None here -- confirm this is intentional.
    return {
        "entity_name": m.get("entity_name"),
        "entity_type": None,
        "entity_identifier": m.get("entity_identifier"),
    }


def _image_details(m):
    """Build the normalized entity fields for an ``image-component`` tag."""
    # Images carry no name of their own; the literal "image" is used and the
    # tag's ``src`` attribute serves as the identifier.
    return {
        "entity_name": "image",
        "entity_type": None,
        "entity_identifier": m.get("src"),
    }


# Registry of tracked HTML component tags.
#   "attributes": tag attributes read from every matching element.
#   "extract":    callable that turns the dict of those attributes into the
#                 entity_name / entity_type / entity_identifier fields.
COMPONENT_MAP = {
    "mention-component": {
        "attributes": ["id", "entity_identifier", "entity_name", "entity_type"],
        "extract": _mention_details,
    },
    "image-component": {
        "attributes": ["id", "src"],
        "extract": _image_details,
    },
}

# Shallow copy of COMPONENT_MAP; this is the mapping the functions in this
# module actually read.
component_map = dict(COMPONENT_MAP)
|
|
|
|
|
|
def extract_all_components(description_html):
    """
    Collect every tracked component tag from an HTML description.

    The HTML is parsed once with BeautifulSoup's ``html.parser`` and the
    resulting tree is then searched once per component type registered in
    ``component_map``.

    Args:
        description_html: HTML string to scan. A falsy value (``None`` or an
            empty string) yields an empty list for every component type.

    Returns:
        A dict mapping each component type in ``component_map`` to a list of
        dicts, one per matching tag, holding the attributes configured for
        that type. An attribute missing from a tag is stored as ``None``.

    Extraction is best-effort: if parsing or lookup raises, the error is
    reported through ``log_exception`` and an empty result is returned
    instead of propagating the exception.
    """
    if not description_html:
        return {component: [] for component in component_map}

    try:
        soup = BeautifulSoup(description_html, "html.parser")
        results = {}

        for component, config in component_map.items():
            # Fall back to collecting only the id when a config lists no
            # attributes of its own.
            attributes = config.get("attributes", ["id"])
            results[component] = [
                {attr: tag.get(attr) for attr in attributes}
                for tag in soup.find_all(component)
            ]

        return results

    except Exception as e:
        # Keep the empty fallback, but report the failure: callers cannot
        # tell "no components" apart from "extraction failed", so a silent
        # swallow would hide the problem entirely.
        log_exception(e)
        return {component: [] for component in component_map}
|
|
|
|
|
|
def get_entity_details(component: str, mention: dict):
    """
    Turn a component's raw attributes into normalized entity fields.

    Looks ``component`` up in ``component_map`` and hands ``mention`` to that
    entry's ``extract`` callable. When the component is unknown (or its
    config is empty), a dict with ``entity_name``, ``entity_type`` and
    ``entity_identifier`` all set to ``None`` is returned.
    """
    component_config = component_map.get(component)
    if component_config:
        normalize = component_config["extract"]
        return normalize(mention)

    # No usable config: every field defaults to None.
    return dict.fromkeys(("entity_name", "entity_type", "entity_identifier"))
|
|
|
|
|
|
@shared_task
def page_transaction(new_description_html, old_description_html, page_id):
    """
    Sync a page's PageLog rows with the components found in its HTML.

    Components (the tag types registered in ``component_map``) are extracted
    from both the old and the new description. A PageLog row is created for
    every component id that is new, and rows whose id disappeared from the
    description are deleted.

    Args:
        new_description_html: HTML of the page after the change.
        old_description_html: HTML of the page before the change.
        page_id: primary key of the ``Page`` being updated.

    Returns:
        None. A missing page is ignored silently; any other exception is
        reported through ``log_exception`` and not re-raised.
    """
    try:
        page = Page.objects.get(pk=page_id)

        # When the page has no log rows at all, components that already
        # existed in the old HTML are logged too (see the skip check below).
        has_existing_logs = PageLog.objects.filter(page_id=page_id).exists()

        # NOTE(review): extract_all_components returns empty lists when
        # extraction fails, which makes every old id look removed here.
        old_components = extract_all_components(old_description_html)
        new_components = extract_all_components(new_description_html)

        new_transactions = []
        deleted_transaction_ids = set()

        # One timestamp for the whole batch; bulk_create is given the
        # created_at/updated_at values explicitly.
        current_time = timezone.now()

        for component in component_map:
            old_entities = old_components[component]
            new_entities = new_components[component]

            old_ids = {m.get("id") for m in old_entities if m.get("id")}
            new_ids = {m.get("id") for m in new_entities if m.get("id")}
            deleted_transaction_ids.update(old_ids - new_ids)

            for mention in new_entities:
                mention_id = mention.get("id")
                # Skip tags without an id, and tags that were already present
                # before the edit when the page already has log rows.
                if not mention_id or (mention_id in old_ids and has_existing_logs):
                    continue

                details = get_entity_details(component, mention)

                new_transactions.append(
                    PageLog(
                        transaction=mention_id,
                        page_id=page_id,
                        entity_identifier=details["entity_identifier"],
                        entity_name=details["entity_name"],
                        entity_type=details["entity_type"],
                        workspace_id=page.workspace_id,
                        created_at=current_time,
                        updated_at=current_time,
                    )
                )

        # Bulk insert and cleanup
        if new_transactions:
            PageLog.objects.bulk_create(new_transactions, batch_size=50, ignore_conflicts=True)

        if deleted_transaction_ids:
            # Restrict the cleanup to this page: component ids come from the
            # HTML itself, so another page may carry the same id and its log
            # rows must not be removed by an edit made here.
            PageLog.objects.filter(
                page_id=page_id,
                transaction__in=deleted_transaction_ids,
            ).delete()

    except Page.DoesNotExist:
        return
    except Exception as e:
        log_exception(e)
        return
|