diff --git a/dripper/process/html_utils.py b/dripper/process/html_utils.py
index 8419f05..05fcd07 100644
--- a/dripper/process/html_utils.py
+++ b/dripper/process/html_utils.py
@@ -5,6 +5,7 @@
and lxml HtmlElement objects.
"""
+import re
import html
from lxml import html as lxmlhtml
@@ -76,3 +77,19 @@ def element_to_html_unescaped(element: lxmlhtml.HtmlElement) -> str:
"""
s = element_to_html(element)
return html.unescape(s)
+
+
+def decode_http_urls_only(html_str):
+    """Unescape HTML entities only inside absolute href/src URL attribute values."""
+    def decode_match(match):
+        prefix = match.group(1)  # 'href="' or 'src="'
+        url = match.group(2)
+        suffix = match.group(3)  # closing '"'
+
+        if url.startswith(('http://', 'https://', 'ftp://', '//')):
+            decoded_url = html.unescape(url)
+            return f'{prefix}{decoded_url}{suffix}'
+        return match.group(0)
+
+    pattern = r'(href="|src=")(.*?)(")'
+    return re.sub(pattern, decode_match, html_str, flags=re.IGNORECASE | re.DOTALL)
\ No newline at end of file
diff --git a/dripper/process/map_to_main.py b/dripper/process/map_to_main.py
index edda6ed..213bc4d 100644
--- a/dripper/process/map_to_main.py
+++ b/dripper/process/map_to_main.py
@@ -10,7 +10,7 @@
from lxml import html
from dripper.base import ITEM_ID_ATTR, TAIL_BLOCK_TAG, TagType
-from dripper.process.html_utils import element_to_html_unescaped, html_to_element
+from dripper.process.html_utils import element_to_html, html_to_element, decode_http_urls_only
def remove_recursive_by_condition(
@@ -85,4 +85,4 @@ def extract_main_html(map_html: str, response: dict) -> str:
for tail_block in root.xpath(f'//{TAIL_BLOCK_TAG}'):
tail_block.drop_tag()
- return element_to_html_unescaped(root)
+ return decode_http_urls_only(element_to_html(root))
diff --git a/dripper/process/simplify_html.py b/dripper/process/simplify_html.py
index 23ec5ae..7db071b 100644
--- a/dripper/process/simplify_html.py
+++ b/dripper/process/simplify_html.py
@@ -12,6 +12,7 @@
from bs4 import BeautifulSoup
from lxml import etree, html
+from selectolax.parser import HTMLParser
# Inline tags that should be treated as inline elements
inline_tags = {
@@ -23,6 +24,16 @@
# 'td', 'th' # Commented out: table cells are handled specially
}
+# Tags treated as opaque leaves: is_block_element/has_block_children return False for them
+no_block_tags = {
+    "math"
+}
+
+# Tags whose inner text is excluded from truncation length accounting
+no_calc_text_tags = {
+    "math", "table"
+}
+
# Tags to remove from HTML (navigation, metadata, etc.)
tags_to_remove = {
'head',
@@ -198,6 +209,8 @@ def extract_paragraphs(
def is_block_element(node) -> bool:
"""Determine if a node is a block-level element."""
# Handle special case for table cells
+ if node.tag in no_block_tags:
+ return False
if node.tag in ('td', 'th'):
# Find the nearest ancestor table element
table_ancestor = node
@@ -221,6 +234,8 @@ def is_block_element(node) -> bool:
def has_block_children(node) -> bool:
"""Determine if a node has block-level children."""
+ if node.tag in no_block_tags:
+ return False
return any(is_block_element(child) for child in node.iterchildren())
def clone_structure(
@@ -706,64 +721,179 @@ def remove_specific_elements(element):
if should_remove_element(element):
parent = element.getparent()
if parent is not None:
- parent.remove(element)
+            # parent.remove(element) is deferred below so the tail text is preserved first
+ tail_text = element.tail or ""
+ element.tail = None
+
+ prev_sibling = element.getprevious()
+ if prev_sibling is not None:
+ if prev_sibling.tail is not None:
+ prev_sibling.tail += tail_text
+ else:
+ if prev_sibling.text is not None:
+ prev_sibling.text += tail_text
+ else:
+ prev_sibling.text = tail_text
+ else:
+ if parent.text is not None:
+ parent.text += tail_text
+ else:
+ parent.text = tail_text
+ parent.remove(element)
-def truncate_text_content(element, max_length=500):
+def truncate_html_element_selective(element, max_length, ellipsis='...', exclude_tags=None):
+ """
+    Truncate the text content of an lxml element tree to at most ``max_length`` characters.
+    Text inside ``exclude_tags`` elements does not count toward the limit (their tail text does),
+    and the ellipsis is never inserted inside an excluded element.
+
+    Args:
+        element (lxml.etree.Element): Element whose subtree is truncated in place.
+        max_length (int): Maximum allowed text length (text inside excluded tags not counted).
+        ellipsis (str): Marker appended at the truncation point; defaults to '...'.
+        exclude_tags (set | list): Tag names excluded from length accounting, e.g. ['math', 'script', 'style'].
+
+    Returns:
+        lxml.etree.Element: The same element, modified in place.
"""
- Recursively process text content of element and its children.
+ if exclude_tags is None:
+ exclude_tags = set()
- Truncates when total length exceeds max_length while keeping tag structure intact.
+ def _calculate_text_length(node):
+ """Calculate the effective text length of the node and its descendants (excluding text within the specified label)"""
+ total_length = 0
- Args:
- element: HTML element to process
- max_length: Maximum total text length allowed
- """
- # First collect all text nodes (including text and tail)
- text_nodes = []
+ if node.text and not _is_excluded(node):
+ total_length += len(node.text)
- # Collect element's text
- if element.text and element.text.strip():
- text_nodes.append(('text', element, element.text))
+ for child in node:
+ total_length += _calculate_text_length(child)
- # Recursively process child elements
- for child in element:
- truncate_text_content(child, max_length)
- # Collect child's tail
- if child.tail and child.tail.strip():
- text_nodes.append(('tail', child, child.tail))
-
- # Calculate total text length under current element
- total_length = sum(len(text) for (typ, node, text) in text_nodes)
-
- # If total length doesn't exceed limit, return directly
- if total_length <= max_length:
- return
-
- # Otherwise perform truncation
- remaining = max_length
- for typ, node, text in text_nodes:
- if remaining <= 0:
- # Already reached limit, clear remaining text content
- if typ == 'text':
- node.text = None
- else:
- node.tail = None
- continue
+ if node.tail:
+ total_length += len(node.tail)
+ return total_length
+
+ def _is_excluded(node):
+ """Check if the node or its ancestor node is in the exclusion list"""
+ current = node
+ while current is not None:
+ if current.tag in exclude_tags:
+ return True
+ current = current.getparent()
+ return False
+
+ current_length = [0]
+ ellipsis_added = [False]
+ nodes_to_process = []
+
+ def _collect_text_nodes(node):
+ """
+        Recursively collect every text/tail segment (in document order), marking
+        whether each segment may be modified (i.e. not inside an excluded tag).
+ """
+ if node.text and not _is_excluded(node):
+ nodes_to_process.append({
+ 'type': 'text',
+ 'node': node,
+ 'original_text': node.text,
+ 'can_modify': not _is_inside_excluded_tag(node)
+ })
- if len(text) > remaining:
- # Need to truncate this text node
- if typ == 'text':
- node.text = text[:remaining] + '...'
+ for child in node:
+ _collect_text_nodes(child)
+
+ if node.tail:
+ nodes_to_process.append({
+ 'type': 'tail',
+ 'node': node,
+ 'original_text': node.tail,
+ 'can_modify': not _is_inside_excluded_tag(node)
+ })
+
+ def _is_inside_excluded_tag(node):
+ """Check if the node is located inside the excluded label"""
+ return _is_excluded(node.getparent()) if node.getparent() is not None else False
+
+ def _process_text_nodes():
+ """Process the collected text nodes, perform truncation and ellipsis addition"""
+ for node_info in nodes_to_process:
+ if ellipsis_added[0]:
+ if node_info['type'] == 'text':
+ node_info['node'].text = None
+ else:
+ node_info['node'].tail = None
+ continue
+
+ text_len = len(node_info['original_text'])
+ if current_length[0] + text_len <= max_length:
+ current_length[0] += text_len
else:
- node.tail = text[:remaining] + '...'
- remaining = 0
- else:
- remaining -= len(text)
+ if node_info['can_modify']:
+ remaining = max_length - current_length[0]
+ truncated_text = node_info['original_text'][:remaining] + ellipsis
+
+ if node_info['type'] == 'text':
+ node_info['node'].text = truncated_text
+ else:
+ node_info['node'].tail = truncated_text
+
+ current_length[0] = max_length
+ ellipsis_added[0] = True
+
+ _mark_truncation_point(node_info['node'])
+ else:
+ current_length[0] += text_len
+
+ def _mark_truncation_point(truncate_node):
+ """Mark truncation points and clean up subsequent content"""
+ parent = truncate_node.getparent()
+ if parent is not None:
+ children = list(parent)
+ try:
+ index = children.index(truncate_node)
+ for sibling in children[index + 1:]:
+ parent.remove(sibling)
+ except ValueError:
+ pass
+
+ _clean_ancestors_following_siblings(truncate_node)
+
+ def _clean_ancestors_following_siblings(node):
+ """Recursively clean up the subsequent sibling nodes of all ancestor nodes"""
+ parent = node.getparent()
+ if parent is None:
+ return
+
+ grandparent = parent.getparent()
+ if grandparent is None:
+ return
+
+ children = list(grandparent)
+ try:
+ index = children.index(parent)
+ for sibling in children[index + 1:]:
+ grandparent.remove(sibling)
+ except ValueError:
+ pass
+
+ _clean_ancestors_following_siblings(parent)
+
+ # 1. First, calculate the total text length
+ total_text_length = _calculate_text_length(element)
+
+ # 2. If the total length does not exceed the limit, return directly
+ if total_text_length <= max_length:
+ return element
+
+ # 3. Collect and process text nodes
+ _collect_text_nodes(element)
+ _process_text_nodes()
+ return element
def process_paragraphs(
- paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement]
+ paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement], cutoff_length: int = 500
) -> Tuple[str, html.HtmlElement]:
"""
Process paragraphs and add _item_id attributes.
@@ -800,7 +930,7 @@ def process_paragraphs(
continue
# Truncate overly long text content
- truncate_text_content(root, max_length=200)
+ truncate_html_element_selective(root, max_length=cutoff_length, exclude_tags=no_calc_text_tags)
# Add same _item_id to current paragraph and original element
current_id = str(item_id)
@@ -978,7 +1108,7 @@ def process_paragraphs(
return post_process_html(simplified_html)
-def simplify_html(html_str: str) -> Tuple[str, str]:
+def simplify_html(html_str: str, cutoff_length: int = 500) -> Tuple[str, str]:
"""
Simplify HTML structure and add item IDs.
@@ -997,8 +1127,13 @@ def simplify_html(html_str: str) -> Tuple[str, str]:
preprocessed_html = remove_xml_declaration(html_str)
# Fix unclosed tags using BeautifulSoup (lxml cannot fully fix them)
- soup = BeautifulSoup(preprocessed_html, 'html.parser')
- fixed_html = str(soup)
+    # Prefer selectolax for speed; NOTE(review): HTMLParser rarely raises and .html may be None for junk input, so the except fallback may never trigger — verify.
+ try:
+ soup = HTMLParser(preprocessed_html)
+ fixed_html = soup.html
+ except Exception:
+ soup = BeautifulSoup(preprocessed_html, 'html.parser')
+ fixed_html = str(soup)
# Parse original DOM
original_dom = html.fromstring(fixed_html)
@@ -1017,7 +1152,7 @@ def simplify_html(html_str: str) -> Tuple[str, str]:
)
# Process paragraphs (synchronously add IDs)
- simplified_html = process_paragraphs(paragraphs, original_uid_map)
+ simplified_html = process_paragraphs(paragraphs, original_uid_map, cutoff_length)
remove_all_uids(original_dom)
original_html = etree.tostring(
diff --git a/requirements.txt b/requirements.txt
index a57dd50..d6c8a4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+selectolax
beautifulsoup4
fastapi
html2text