17 changes: 17 additions & 0 deletions dripper/process/html_utils.py
@@ -5,6 +5,7 @@
and lxml HtmlElement objects.
"""

import re
import html
from lxml import html as lxmlhtml

@@ -76,3 +77,19 @@ def element_to_html_unescaped(element: lxmlhtml.HtmlElement) -> str:
"""
s = element_to_html(element)
return html.unescape(s)


def decode_http_urls_only(html_str: str) -> str:
    """Unescape HTML entities only inside href/src values that hold absolute URLs."""
    def decode_match(match):
        prefix = match.group(1)  # href=" or src="
        url = match.group(2)
        suffix = match.group(3)  # closing quote

if url.startswith(('http://', 'https://', 'ftp://', '//')):
decoded_url = html.unescape(url)
return f'{prefix}{decoded_url}{suffix}'
return match.group(0)

pattern = r'(href="|src=")(.*?)(")'
return re.sub(pattern, decode_match, html_str, flags=re.IGNORECASE | re.DOTALL)
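
A quick sanity check of the new helper (hypothetical input, not part of the PR): only absolute URLs inside `href`/`src` attributes get unescaped; relative paths and ordinary text are left alone.

```python
from dripper.process.html_utils import decode_http_urls_only

s = '<img src="/img.png?a=1&amp;b=2"><a href="https://e.com/?a=1&amp;b=2">x</a>'
decode_http_urls_only(s)
# '<img src="/img.png?a=1&amp;b=2"><a href="https://e.com/?a=1&b=2">x</a>'
# the relative src is untouched; the absolute href has &amp; decoded to &
```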
4 changes: 2 additions & 2 deletions dripper/process/map_to_main.py
@@ -10,7 +10,7 @@
from lxml import html

from dripper.base import ITEM_ID_ATTR, TAIL_BLOCK_TAG, TagType
from dripper.process.html_utils import element_to_html_unescaped, html_to_element
from dripper.process.html_utils import element_to_html, html_to_element, decode_http_urls_only


def remove_recursive_by_condition(
@@ -85,4 +85,4 @@ def extract_main_html(map_html: str, response: dict) -> str:
for tail_block in root.xpath(f'//{TAIL_BLOCK_TAG}'):
tail_block.drop_tag()

return element_to_html_unescaped(root)
return decode_http_urls_only(element_to_html(root))
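
The motivation for swapping `element_to_html_unescaped` for `decode_http_urls_only(element_to_html(...))`, sketched on a made-up string: a blanket `html.unescape` turns escaped text back into live markup, while the new helper only decodes entities inside absolute URLs.

```python
import html
from dripper.process.html_utils import decode_http_urls_only

s = '<p>a &lt; b <a href="https://e.com/?x=1&amp;y=2">link</a></p>'

html.unescape(s)
# '<p>a < b <a href="https://e.com/?x=1&y=2">link</a></p>'    -- '&lt;' becomes a raw '<'

decode_http_urls_only(s)
# '<p>a &lt; b <a href="https://e.com/?x=1&y=2">link</a></p>' -- only the URL is decoded
```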
237 changes: 186 additions & 51 deletions dripper/process/simplify_html.py
@@ -12,6 +12,7 @@

from bs4 import BeautifulSoup
from lxml import etree, html
from selectolax.parser import HTMLParser

# Inline tags that should be treated as inline elements
inline_tags = {
@@ -23,6 +24,16 @@
# 'td', 'th' # Commented out: table cells are handled specially
}

# Tags treated as block-level elements that are assumed to contain no block-level elements inside
no_block_tags = {
"math"
}

# Tags whose text is excluded from the text-length calculation
no_calc_text_tags = {
"math", "table"
}

# Tags to remove from HTML (navigation, metadata, etc.)
tags_to_remove = {
'head',
@@ -198,6 +209,8 @@ def extract_paragraphs(
def is_block_element(node) -> bool:
"""Determine if a node is a block-level element."""
# Handle special case for table cells
if node.tag in no_block_tags:
return False
if node.tag in ('td', 'th'):
# Find the nearest ancestor table element
table_ancestor = node
@@ -221,6 +234,8 @@ def is_block_element(node) -> bool:

def has_block_children(node) -> bool:
"""Determine if a node has block-level children."""
if node.tag in no_block_tags:
return False
return any(is_block_element(child) for child in node.iterchildren())

def clone_structure(
@@ -706,64 +721,179 @@ def remove_specific_elements(element):
if should_remove_element(element):
parent = element.getparent()
if parent is not None:
parent.remove(element)
tail_text = element.tail or ""
element.tail = None

# Re-attach the tail so text that followed the removed element is kept
prev_sibling = element.getprevious()
if prev_sibling is not None:
    # Append after the previous sibling (its tail), never inside it
    prev_sibling.tail = (prev_sibling.tail or "") + tail_text
else:
    parent.text = (parent.text or "") + tail_text

parent.remove(element)
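
A minimal sketch (hypothetical markup) of why the tail must be re-attached before removal: in lxml, `parent.remove(el)` discards `el.tail`, i.e. the text between the element and its next sibling.

```python
from lxml import html

bad = html.fromstring('<div><p>keep</p><script>x()</script> tail text</div>')
bad.remove(bad.find('script'))      # naive removal: ' tail text' is lost
html.tostring(bad)                  # b'<div><p>keep</p></div>'

good = html.fromstring('<div><p>keep</p><script>x()</script> tail text</div>')
el = good.find('script')
prev = el.getprevious()
prev.tail = (prev.tail or '') + (el.tail or '')   # re-attach first, as above
el.tail = None
good.remove(el)
html.tostring(good)                 # b'<div><p>keep</p> tail text</div>'
```

lxml.html elements also expose `drop_tree()`, which merges the tail into the previous node automatically; the explicit version keeps the same behavior while making the text flow obvious.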

def truncate_text_content(element, max_length=500):
def truncate_html_element_selective(element, max_length, ellipsis='...', exclude_tags=None):
"""
Truncate the text content within the lxml element and exclude the text within the specified label from being included in the length limit,
But the tail outside the specified label is included in the length limit, and an ellipsis is added after the text of the length limit,
And ensure that ellipses are not inserted inside the excluded labels.

params:
element (lxml.etree.Element): The lxml elements to be processed
max_length (int): Maximum allowed text length (excluding the length of specified labels)
ellipsis (str): The omitted symbol added after truncation defaults to '...'
exclude_tags (list): List of label names not included in length statistics, such as ['math ',' script ',' style ']

return:
lxml.etree.Element: Processed elements
"""
Recursively process text content of element and its children.
if exclude_tags is None:
exclude_tags = set()

Truncates when total length exceeds max_length while keeping tag structure intact.
def _calculate_text_length(node):
"""Calculate the effective text length of the node and its descendants (excluding text within the specified label)"""
total_length = 0

Args:
element: HTML element to process
max_length: Maximum total text length allowed
"""
# First collect all text nodes (including text and tail)
text_nodes = []
if node.text and not _is_excluded(node):
total_length += len(node.text)

# Collect element's text
if element.text and element.text.strip():
text_nodes.append(('text', element, element.text))
for child in node:
total_length += _calculate_text_length(child)

# Recursively process child elements
for child in element:
truncate_text_content(child, max_length)
# Collect child's tail
if child.tail and child.tail.strip():
text_nodes.append(('tail', child, child.tail))

# Calculate total text length under current element
total_length = sum(len(text) for (typ, node, text) in text_nodes)

# If total length doesn't exceed limit, return directly
if total_length <= max_length:
return

# Otherwise perform truncation
remaining = max_length
for typ, node, text in text_nodes:
if remaining <= 0:
# Already reached limit, clear remaining text content
if typ == 'text':
node.text = None
else:
node.tail = None
continue
if node.tail:
total_length += len(node.tail)
return total_length

def _is_excluded(node):
"""Check if the node or its ancestor node is in the exclusion list"""
current = node
while current is not None:
if current.tag in exclude_tags:
return True
current = current.getparent()
return False

current_length = [0]
ellipsis_added = [False]
nodes_to_process = []

def _collect_text_nodes(node):
"""
Recursively collect every text node to process (both text and tail),
marking whether each one may be modified (i.e. it is not inside an excluded tag).
"""
if node.text and not _is_excluded(node):
nodes_to_process.append({
'type': 'text',
'node': node,
'original_text': node.text,
'can_modify': not _is_inside_excluded_tag(node)
})

if len(text) > remaining:
# Need to truncate this text node
if typ == 'text':
node.text = text[:remaining] + '...'
for child in node:
_collect_text_nodes(child)

if node.tail:
nodes_to_process.append({
'type': 'tail',
'node': node,
'original_text': node.tail,
'can_modify': not _is_inside_excluded_tag(node)
})

def _is_inside_excluded_tag(node):
"""Check if the node is located inside the excluded label"""
return _is_excluded(node.getparent()) if node.getparent() is not None else False

def _process_text_nodes():
"""Process the collected text nodes, perform truncation and ellipsis addition"""
for node_info in nodes_to_process:
if ellipsis_added[0]:
if node_info['type'] == 'text':
node_info['node'].text = None
else:
node_info['node'].tail = None
continue

text_len = len(node_info['original_text'])
if current_length[0] + text_len <= max_length:
current_length[0] += text_len
else:
node.tail = text[:remaining] + '...'
remaining = 0
else:
remaining -= len(text)
if node_info['can_modify']:
remaining = max_length - current_length[0]
truncated_text = node_info['original_text'][:remaining] + ellipsis

if node_info['type'] == 'text':
node_info['node'].text = truncated_text
else:
node_info['node'].tail = truncated_text

current_length[0] = max_length
ellipsis_added[0] = True

_mark_truncation_point(node_info['node'])
else:
current_length[0] += text_len

def _mark_truncation_point(truncate_node):
"""Mark truncation points and clean up subsequent content"""
parent = truncate_node.getparent()
if parent is not None:
children = list(parent)
try:
index = children.index(truncate_node)
for sibling in children[index + 1:]:
parent.remove(sibling)
except ValueError:
pass

_clean_ancestors_following_siblings(truncate_node)

def _clean_ancestors_following_siblings(node):
"""Recursively clean up the subsequent sibling nodes of all ancestor nodes"""
parent = node.getparent()
if parent is None:
return

grandparent = parent.getparent()
if grandparent is None:
return

children = list(grandparent)
try:
index = children.index(parent)
for sibling in children[index + 1:]:
grandparent.remove(sibling)
except ValueError:
pass

_clean_ancestors_following_siblings(parent)

# 1. First, calculate the total text length
total_text_length = _calculate_text_length(element)

# 2. If the total length does not exceed the limit, return directly
if total_text_length <= max_length:
return element

# 3. Collect and process text nodes
_collect_text_nodes(element)
_process_text_nodes()

return element
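
A quick smoke test of the new helper (hypothetical values, assuming the module path from this PR): with `max_length=5` and `math` excluded, the first paragraph is truncated and everything after the cut is dropped, while the `<math>` text never counts toward the limit.

```python
from lxml import etree, html
from dripper.process.simplify_html import truncate_html_element_selective

root = html.fromstring('<div><p>abcdefghij</p><math>m</math><p>tail</p></div>')
truncate_html_element_selective(root, max_length=5, exclude_tags={'math'})
etree.tostring(root)
# b'<div><p>abcde...</p></div>'
```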

def process_paragraphs(
paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement]
paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement], cutoff_length: int = 500
) -> Tuple[str, html.HtmlElement]:
"""
Process paragraphs and add _item_id attributes.
@@ -800,7 +930,7 @@ def process_paragraphs(
continue

# Truncate overly long text content
truncate_text_content(root, max_length=200)
truncate_html_element_selective(root, max_length=cutoff_length, exclude_tags=no_calc_text_tags)

# Add same _item_id to current paragraph and original element
current_id = str(item_id)
@@ -978,7 +1108,7 @@ def process_paragraphs(
return post_process_html(simplified_html)


def simplify_html(html_str: str) -> Tuple[str, str]:
def simplify_html(html_str: str, cutoff_length: int = 500) -> Tuple[str, str]:
"""
Simplify HTML structure and add item IDs.

@@ -997,8 +1127,13 @@ def simplify_html(html_str: str) -> Tuple[str, str]:
preprocessed_html = remove_xml_declaration(html_str)

# Fix unclosed tags (lxml alone cannot fully repair them)
soup = BeautifulSoup(preprocessed_html, 'html.parser')
fixed_html = str(soup)
# Prefer selectolax for speed; fall back to BeautifulSoup if it fails
try:
    tree = HTMLParser(preprocessed_html)
    fixed_html = tree.html
    # Guard: .html may be empty/None for degenerate input
    if not fixed_html:
        raise ValueError('selectolax produced no output')
except Exception:
    soup = BeautifulSoup(preprocessed_html, 'html.parser')
    fixed_html = str(soup)

# Parse original DOM
original_dom = html.fromstring(fixed_html)
@@ -1017,7 +1152,7 @@ def simplify_html(html_str: str) -> Tuple[str, str]:
)

# Process paragraphs (synchronously add IDs)
simplified_html = process_paragraphs(paragraphs, original_uid_map)
simplified_html = process_paragraphs(paragraphs, original_uid_map, cutoff_length)

remove_all_uids(original_dom)
original_html = etree.tostring(
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
selectolax
beautifulsoup4
fastapi
html2text