# Python imports
|
|
import logging
|
|
|
|
|
|
# Third party imports
|
|
from celery import shared_task
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
from urllib.parse import urlparse, urljoin
|
|
import base64
|
|
import ipaddress
|
|
from typing import Dict, Any
|
|
from typing import Optional
|
|
from plane.db.models import IssueLink
|
|
from plane.utils.exception_logger import log_exception
|
|
|
|
logger = logging.getLogger("plane.worker")
|
|
|
|
|
|
DEFAULT_FAVICON = "PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIyNCIgaGVpZ2h0PSIyNCIgdmlld0JveD0iMCAwIDI0IDI0IiBmaWxsPSJub25lIiBzdHJva2U9ImN1cnJlbnRDb2xvciIgc3Ryb2tlLXdpZHRoPSIyIiBzdHJva2UtbGluZWNhcD0icm91bmQiIHN0cm9rZS1saW5lam9pbj0icm91bmQiIGNsYXNzPSJsdWNpZGUgbHVjaWRlLWxpbmstaWNvbiBsdWNpZGUtbGluayI+PHBhdGggZD0iTTEwIDEzYTUgNSAwIDAgMCA3LjU0LjU0bDMtM2E1IDUgMCAwIDAtNy4wNy03LjA3bC0xLjcyIDEuNzEiLz48cGF0aCBkPSJNMTQgMTFhNSA1IDAgMCAwLTcuNTQtLjU0bC0zIDNhNSA1IDAgMCAwIDcuMDcgNy4wN2wxLjcxLTEuNzEiLz48L3N2Zz4=" # noqa: E501
|
|
|
|
def crawl_work_item_link_title_and_favicon(url: str) -> Dict[str, Any]:
    """
    Crawls a URL to extract the page title and favicon.

    Args:
        url (str): The URL to crawl

    Returns:
        Dict[str, Any]: Dictionary with the page title, base64-encoded
        favicon data URI, the original URL, and the resolved favicon URL.
        On failure, a dictionary with an "error" key and None title/favicon.
    """
    try:
        # SSRF guard: refuse to crawl private/internal network addresses.
        parsed = urlparse(url)

        try:
            ip = ipaddress.ip_address(parsed.hostname)
        except ValueError:
            # Hostname is not an IP literal; continue with domain crawling.
            pass
        else:
            # BUGFIX: this rejection used to be raised *inside* the try whose
            # `except ValueError: pass` was meant only for non-IP hostnames,
            # so the private-network check was silently swallowed and never
            # took effect. Raising here (outside that except) lets the outer
            # handler turn it into an error result.
            if ip.is_private or ip.is_loopback or ip.is_reserved:
                raise ValueError("Access to private/internal networks is not allowed")

        # Mimic a real browser so sites don't serve bot-blocked responses.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"  # noqa: E501
        }

        soup = None
        title = None

        try:
            response = requests.get(url, headers=headers, timeout=1)

            soup = BeautifulSoup(response.content, "html.parser")
            title_tag = soup.find("title")
            title = title_tag.get_text().strip() if title_tag else None

        except requests.RequestException as e:
            # Best effort: a failed HTML fetch still allows a favicon fallback.
            logger.warning(f"Failed to fetch HTML for title: {str(e)}")

        # Fetch and encode favicon (falls back to a default link icon).
        favicon_base64 = fetch_and_encode_favicon(headers, soup, url)

        # Prepare result
        result = {
            "title": title,
            "favicon": favicon_base64["favicon_base64"],
            "url": url,
            "favicon_url": favicon_base64["favicon_url"],
        }

        return result

    except Exception as e:
        log_exception(e)
        return {
            "error": f"Unexpected error: {str(e)}",
            "title": None,
            "favicon": None,
            "url": url,
        }
|
|
|
|
|
|
def find_favicon_url(soup: Optional[BeautifulSoup], base_url: str) -> Optional[str]:
    """
    Resolve the favicon URL for a page.

    Args:
        soup: Parsed HTML of the page, or None if the page fetch failed
        base_url: Page URL used to resolve relative favicon paths

    Returns:
        str: Absolute URL to the favicon, or None when unavailable
    """
    if soup is None:
        return None

    # Try the common <link rel=...> favicon declarations, most specific first.
    for rel_selector in (
        'link[rel="icon"]',
        'link[rel="shortcut icon"]',
        'link[rel="apple-touch-icon"]',
        'link[rel="apple-touch-icon-precomposed"]',
    ):
        tag = soup.select_one(rel_selector)
        if tag and tag.get("href"):
            return urljoin(base_url, tag["href"])

    # No declared icon: probe the conventional /favicon.ico location.
    parts = urlparse(base_url)
    candidate = f"{parts.scheme}://{parts.netloc}/favicon.ico"

    try:
        head_response = requests.head(candidate, timeout=2)
    except requests.RequestException as e:
        log_exception(e)
        return None

    if head_response.status_code == 200:
        return candidate

    return None
|
|
|
|
|
|
def fetch_and_encode_favicon(
    headers: Dict[str, str], soup: Optional[BeautifulSoup], url: str
) -> Dict[str, Optional[str]]:
    """
    Fetch the favicon for a page and encode it as a base64 data URI.

    Args:
        headers: HTTP request headers to use when fetching the favicon
        soup: Parsed HTML of the page (or None if the page fetch failed)
        url: The page URL, used to locate and resolve the favicon

    Returns:
        Dict[str, Optional[str]]: "favicon_url" (absolute URL or None) and
        "favicon_base64" (data URI; falls back to a default link icon when
        no favicon can be fetched)
    """
    try:
        favicon_url = find_favicon_url(soup, url)
        if favicon_url is None:
            return {
                "favicon_url": None,
                "favicon_base64": f"data:image/svg+xml;base64,{DEFAULT_FAVICON}",
            }

        response = requests.get(favicon_url, headers=headers, timeout=1)
        # BUGFIX: without this check, a 404/error page body would be
        # base64-encoded as the "favicon"; raising here drops us to the
        # default-icon fallback below instead.
        response.raise_for_status()

        # Get content type
        content_type = response.headers.get("content-type", "image/x-icon")

        # Convert to base64
        favicon_base64 = base64.b64encode(response.content).decode("utf-8")

        # Return as data URI
        return {
            "favicon_url": favicon_url,
            "favicon_base64": f"data:{content_type};base64,{favicon_base64}",
        }

    except Exception as e:
        logger.warning(f"Failed to fetch favicon: {e}")
        return {
            "favicon_url": None,
            "favicon_base64": f"data:image/svg+xml;base64,{DEFAULT_FAVICON}",
        }
|
|
|
|
|
|
@shared_task
def crawl_work_item_link_title(id: str, url: str) -> None:
    """Background task: crawl *url* and persist the crawled title/favicon
    metadata on the IssueLink identified by *id*."""
    crawled_metadata = crawl_work_item_link_title_and_favicon(url)

    link = IssueLink.objects.get(id=id)
    link.metadata = crawled_metadata
    link.save()
|