bb-plane-fork/apiserver/plane/bgtasks/work_item_link_task.py
Bavisetti Narayan 053c895120
[WEB 4252] chore: updated the favicon request for work item link (#7173)
* chore: added the favicon to link

* chore: added none validation for soup
2025-06-06 15:02:00 +05:30

177 lines
5.5 KiB
Python

# Python imports
import logging
# Third party imports
from celery import shared_task
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import base64
import ipaddress
from typing import Dict, Any
from typing import Optional
from plane.db.models import IssueLink
from plane.utils.exception_logger import log_exception
logger = logging.getLogger("plane.worker")
DEFAULT_FAVICON = "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLWxpbmstaWNvbiBsdWNpZGUtbGluayI+PHBhdGggZD0iTTEwIDEzYTUgNSAwIDAgMCA3LjU0LjU0bDMtM2E1IDUgMCAwIDAtNy4wNy03LjA3bC0xLjcyIDEuNzEiLz48cGF0aCBkPSJNMTQgMTFhNSA1IDAgMCAwLTcuNTQtLjU0bC0zIDNhNSA1IDAgMCAwIDcuMDcgNy4wN2wxLjcxLTEuNzEiLz48L3N2Zz4=" # noqa: E501
def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
    """
    Crawl a URL to extract its page title and favicon.

    Args:
        url (str): The URL to crawl.

    Returns:
        Dict[str, Any]: Dict with "title", "favicon" (base64 data URI),
        "favicon_url" and "url" keys. On failure, a dict with an "error"
        message and "title"/"favicon" set to None.
    """
    try:
        # SSRF guard: reject URLs whose host is a literal private/internal IP.
        # The rejection must be raised from an `else` clause — in the previous
        # version the ValueError was raised inside the same `try` whose
        # `except ValueError: pass` handled "hostname is not an IP", so the
        # guard was silently swallowed and never blocked anything.
        parsed = urlparse(url)
        try:
            ip = ipaddress.ip_address(parsed.hostname)
        except ValueError:
            # Hostname is not a literal IP address; continue with the domain.
            pass
        else:
            if ip.is_private or ip.is_loopback or ip.is_reserved:
                raise ValueError("Access to private/internal networks is not allowed")

        # Mimic a real browser so sites don't serve bot-blocking pages.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # noqa: E501
        }

        soup = None
        title = None
        try:
            response = requests.get(url, headers=headers, timeout=1)
            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text().strip() if title_tag else None
        except requests.RequestException as e:
            # Title is best-effort; favicon discovery below still gets a
            # chance (it receives soup=None and falls back to /favicon.ico).
            logger.warning(f"Failed to fetch HTML for title: {str(e)}")

        # Fetch and encode favicon (falls back to a bundled default icon).
        favicon_base64 = fetch_and_encode_favicon(headers, soup, url)

        result = {
            "title": title,
            "favicon": favicon_base64["favicon_base64"],
            "url": url,
            "favicon_url": favicon_base64["favicon_url"],
        }
        return result
    except Exception as e:
        log_exception(e)
        return {
            "error": f"Unexpected error: {str(e)}",
            "title": None,
            "favicon": None,
            "url": url,
        }
def find_favicon_url(soup: Optional[BeautifulSoup], base_url: str) -> Optional[str]:
    """
    Locate the favicon URL for a page.

    Args:
        soup: Parsed HTML of the page, or None if the page fetch failed.
        base_url: Base URL used to resolve relative favicon paths.

    Returns:
        Optional[str]: Absolute favicon URL, or None when none is found.
    """
    if soup is not None:
        # Probe the common <link rel=...> favicon variants in priority order.
        for rel_selector in (
            'link[rel="icon"]',
            'link[rel="shortcut icon"]',
            'link[rel="apple-touch-icon"]',
            'link[rel="apple-touch-icon-precomposed"]',
        ):
            tag = soup.select_one(rel_selector)
            if tag and tag.get("href"):
                return urljoin(base_url, tag["href"])

    # No <link> tag matched (or no HTML at all) — probe the conventional
    # /favicon.ico location at the site root.
    parts = urlparse(base_url)
    fallback_url = f"{parts.scheme}://{parts.netloc}/favicon.ico"
    try:
        head_response = requests.head(fallback_url, timeout=2)
    except requests.RequestException as e:
        log_exception(e)
        return None
    return fallback_url if head_response.status_code == 200 else None
def fetch_and_encode_favicon(
    headers: Dict[str, str], soup: Optional[BeautifulSoup], url: str
) -> Dict[str, Optional[str]]:
    """
    Discover a page's favicon and return it base64-encoded as a data URI.

    Args:
        headers: HTTP request headers to use when fetching the favicon.
        soup: Parsed HTML of the page, or None if it could not be fetched.
        url: Page URL, used to discover/resolve the favicon location.

    Returns:
        Dict[str, Optional[str]]: {"favicon_url": ..., "favicon_base64": ...}.
        On any failure, "favicon_url" is None and "favicon_base64" is the
        bundled default link icon (SVG data URI).
    """
    try:
        favicon_url = find_favicon_url(soup, url)
        if favicon_url is None:
            return {
                "favicon_url": None,
                "favicon_base64": f"data:image/svg+xml;base64,{DEFAULT_FAVICON}",
            }
        response = requests.get(favicon_url, headers=headers, timeout=1)
        # Don't base64-encode an HTTP error body (e.g. a 404 HTML page) as if
        # it were an icon — fail over to the default favicon instead.
        response.raise_for_status()
        # Get content type; many servers omit it for .ico files.
        content_type = response.headers.get("content-type", "image/x-icon")
        # Convert to base64 and return as a data URI.
        favicon_base64 = base64.b64encode(response.content).decode("utf-8")
        return {
            "favicon_url": favicon_url,
            "favicon_base64": f"data:{content_type};base64,{favicon_base64}",
        }
    except Exception as e:
        logger.warning(f"Failed to fetch favicon: {e}")
        return {
            "favicon_url": None,
            "favicon_base64": f"data:image/svg+xml;base64,{DEFAULT_FAVICON}",
        }
@shared_task
def crawl_work_item_link_title(id: str, url: str) -> None:
    """
    Celery task: crawl `url` and store title/favicon metadata on an IssueLink.

    Args:
        id: Primary key of the IssueLink row to update. (Parameter name kept
            as `id` — despite shadowing the builtin — because callers may
            enqueue the task with keyword arguments.)
        url: The link URL to crawl.
    """
    # Confirm the row still exists before doing any network work — the link
    # may have been deleted between task enqueue and execution.
    try:
        issue_link = IssueLink.objects.get(id=id)
    except IssueLink.DoesNotExist:
        logger.warning(f"IssueLink {id} no longer exists; skipping metadata crawl")
        return
    meta_data = crawl_work_item_link_title_and_favicon(url)
    issue_link.metadata = meta_data
    issue_link.save()