diff --git a/dripper/process/html_utils.py b/dripper/process/html_utils.py index 8419f05..05fcd07 100644 --- a/dripper/process/html_utils.py +++ b/dripper/process/html_utils.py @@ -5,6 +5,7 @@ and lxml HtmlElement objects. """ +import re import html from lxml import html as lxmlhtml @@ -76,3 +77,19 @@ def element_to_html_unescaped(element: lxmlhtml.HtmlElement) -> str: """ s = element_to_html(element) return html.unescape(s) + + +def decode_http_urls_only(html_str): + """Unescape HTML entities only inside absolute or protocol-relative URL values of href/src attributes; all other text stays escaped.""" + def decode_match(match): + prefix = match.group(1) # href=" or src=" + url = match.group(2) + suffix = match.group(3) # closing " + + if url.startswith(('http://', 'https://', 'ftp://', '//')): + decoded_url = html.unescape(url) + return f'{prefix}{decoded_url}{suffix}' + return match.group(0) + + pattern = r'(href="|src=")(.*?)(")' + return re.sub(pattern, decode_match, html_str, flags=re.IGNORECASE | re.DOTALL) \ No newline at end of file diff --git a/dripper/process/map_to_main.py b/dripper/process/map_to_main.py index edda6ed..213bc4d 100644 --- a/dripper/process/map_to_main.py +++ b/dripper/process/map_to_main.py @@ -10,7 +10,7 @@ from lxml import html from dripper.base import ITEM_ID_ATTR, TAIL_BLOCK_TAG, TagType -from dripper.process.html_utils import element_to_html_unescaped, html_to_element +from dripper.process.html_utils import element_to_html, html_to_element, decode_http_urls_only def remove_recursive_by_condition( @@ -85,4 +85,4 @@ def extract_main_html(map_html: str, response: dict) -> str: for tail_block in root.xpath(f'//{TAIL_BLOCK_TAG}'): tail_block.drop_tag() - return element_to_html_unescaped(root) + return decode_http_urls_only(element_to_html(root)) diff --git a/dripper/process/simplify_html.py b/dripper/process/simplify_html.py index 23ec5ae..7db071b 100644 --- a/dripper/process/simplify_html.py +++ b/dripper/process/simplify_html.py @@ -12,6 +12,7 @@ from bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser # faster HTML fixer; BeautifulSoup remains the fallback
Inline tags that should be treated as inline elements inline_tags = { @@ -23,6 +24,16 @@ # 'td', 'th' # Commented out: table cells are handled specially } +# Regarded as block level elements, there are no block level elements inside by default +no_block_tags = { + "math" +} + +# Labels that do not calculate text length +no_calc_text_tags = { + "math", "table" +} + # Tags to remove from HTML (navigation, metadata, etc.) tags_to_remove = { 'head', @@ -198,6 +209,8 @@ def extract_paragraphs( def is_block_element(node) -> bool: """Determine if a node is a block-level element.""" # Handle special case for table cells + if node.tag in no_block_tags: + return False if node.tag in ('td', 'th'): # Find the nearest ancestor table element table_ancestor = node @@ -221,6 +234,8 @@ def is_block_element(node) -> bool: def has_block_children(node) -> bool: """Determine if a node has block-level children.""" + if node.tag in no_block_tags: + return False return any(is_block_element(child) for child in node.iterchildren()) def clone_structure( @@ -706,64 +721,179 @@ def remove_specific_elements(element): if should_remove_element(element): parent = element.getparent() if parent is not None: - parent.remove(element) + # parent.remove(element) + tail_text = element.tail or "" + element.tail = None + + prev_sibling = element.getprevious() + if prev_sibling is not None: + if prev_sibling.tail is not None: + prev_sibling.tail += tail_text + else: + if prev_sibling.text is not None: + prev_sibling.text += tail_text + else: + prev_sibling.text = tail_text + else: + if parent.text is not None: + parent.text += tail_text + else: + parent.text = tail_text + parent.remove(element) -def truncate_text_content(element, max_length=500): +def truncate_html_element_selective(element, max_length, ellipsis='...', exclude_tags=None): + """ + Truncate the text content within the lxml element and exclude the text within the specified label from being included in the length limit, + But the tail outside 
the specified label is included in the length limit, and an ellipsis is added after the text of the length limit, + And ensure that ellipses are not inserted inside the excluded labels. + + params: + element (lxml.etree.Element): The lxml elements to be processed + max_length (int): Maximum allowed text length (excluding the length of specified labels) + ellipsis (str): The omitted symbol added after truncation defaults to '...' + exclude_tags (list): List of label names not included in length statistics, such as ['math ',' script ',' style '] + + return: + lxml.etree.Element: Processed elements """ - Recursively process text content of element and its children. + if exclude_tags is None: + exclude_tags = set() - Truncates when total length exceeds max_length while keeping tag structure intact. + def _calculate_text_length(node): + """Calculate the effective text length of the node and its descendants (excluding text within the specified label)""" + total_length = 0 - Args: - element: HTML element to process - max_length: Maximum total text length allowed - """ - # First collect all text nodes (including text and tail) - text_nodes = [] + if node.text and not _is_excluded(node): + total_length += len(node.text) - # Collect element's text - if element.text and element.text.strip(): - text_nodes.append(('text', element, element.text)) + for child in node: + total_length += _calculate_text_length(child) - # Recursively process child elements - for child in element: - truncate_text_content(child, max_length) - # Collect child's tail - if child.tail and child.tail.strip(): - text_nodes.append(('tail', child, child.tail)) - - # Calculate total text length under current element - total_length = sum(len(text) for (typ, node, text) in text_nodes) - - # If total length doesn't exceed limit, return directly - if total_length <= max_length: - return - - # Otherwise perform truncation - remaining = max_length - for typ, node, text in text_nodes: - if remaining <= 0: - # Already 
reached limit, clear remaining text content - if typ == 'text': - node.text = None - else: - node.tail = None - continue + if node.tail: + total_length += len(node.tail) + return total_length + + def _is_excluded(node): + """Check if the node or its ancestor node is in the exclusion list""" + current = node + while current is not None: + if current.tag in exclude_tags: + return True + current = current.getparent() + return False + + current_length = [0] + ellipsis_added = [False] + nodes_to_process = [] + + def _collect_text_nodes(node): + """ + Recursive collection of all text node information that needs to be processed (including text and tail) + Simultaneously mark whether the node is allowed to be modified (not included in the exclusion label) + """ + if node.text and not _is_excluded(node): + nodes_to_process.append({ + 'type': 'text', + 'node': node, + 'original_text': node.text, + 'can_modify': not _is_inside_excluded_tag(node) + }) - if len(text) > remaining: - # Need to truncate this text node - if typ == 'text': - node.text = text[:remaining] + '...' + for child in node: + _collect_text_nodes(child) + + if node.tail: + nodes_to_process.append({ + 'type': 'tail', + 'node': node, + 'original_text': node.tail, + 'can_modify': not _is_inside_excluded_tag(node) + }) + + def _is_inside_excluded_tag(node): + """Check if the node is located inside the excluded label""" + return _is_excluded(node.getparent()) if node.getparent() is not None else False + + def _process_text_nodes(): + """Process the collected text nodes, perform truncation and ellipsis addition""" + for node_info in nodes_to_process: + if ellipsis_added[0]: + if node_info['type'] == 'text': + node_info['node'].text = None + else: + node_info['node'].tail = None + continue + + text_len = len(node_info['original_text']) + if current_length[0] + text_len <= max_length: + current_length[0] += text_len else: - node.tail = text[:remaining] + '...' 
- remaining = 0 - else: - remaining -= len(text) + if node_info['can_modify']: + remaining = max_length - current_length[0] + truncated_text = node_info['original_text'][:remaining] + ellipsis + + if node_info['type'] == 'text': + node_info['node'].text = truncated_text + else: + node_info['node'].tail = truncated_text + + current_length[0] = max_length + ellipsis_added[0] = True + + _mark_truncation_point(node_info['node']) + else: + current_length[0] += text_len + + def _mark_truncation_point(truncate_node): + """Mark truncation points and clean up subsequent content""" + parent = truncate_node.getparent() + if parent is not None: + children = list(parent) + try: + index = children.index(truncate_node) + for sibling in children[index + 1:]: + parent.remove(sibling) + except ValueError: + pass + + _clean_ancestors_following_siblings(truncate_node) + + def _clean_ancestors_following_siblings(node): + """Recursively clean up the subsequent sibling nodes of all ancestor nodes""" + parent = node.getparent() + if parent is None: + return + + grandparent = parent.getparent() + if grandparent is None: + return + + children = list(grandparent) + try: + index = children.index(parent) + for sibling in children[index + 1:]: + grandparent.remove(sibling) + except ValueError: + pass + + _clean_ancestors_following_siblings(parent) + + # 1. First, calculate the total text length + total_text_length = _calculate_text_length(element) + + # 2. If the total length does not exceed the limit, return directly + if total_text_length <= max_length: + return element + + # 3. Collect and process text nodes + _collect_text_nodes(element) + _process_text_nodes() + return element def process_paragraphs( - paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement] + paragraphs: List[Dict[str, str]], uid_map: Dict[str, html.HtmlElement], cutoff_length: int = 500 ) -> Tuple[str, html.HtmlElement]: """ Process paragraphs and add _item_id attributes. 
@@ -800,7 +930,7 @@ def process_paragraphs( continue # Truncate overly long text content - truncate_text_content(root, max_length=200) + truncate_html_element_selective(root, max_length=cutoff_length, exclude_tags=no_calc_text_tags) # Add same _item_id to current paragraph and original element current_id = str(item_id) @@ -978,7 +1108,7 @@ def process_paragraphs( return post_process_html(simplified_html) -def simplify_html(html_str: str) -> Tuple[str, str]: +def simplify_html(html_str: str, cutoff_length: int = 500) -> Tuple[str, str]: """ Simplify HTML structure and add item IDs. @@ -997,8 +1127,13 @@ def simplify_html(html_str: str) -> Tuple[str, str]: preprocessed_html = remove_xml_declaration(html_str) # Fix unclosed tags using BeautifulSoup (lxml cannot fully fix them) - soup = BeautifulSoup(preprocessed_html, 'html.parser') - fixed_html = str(soup) + # Prioritize using Selectolax for better performance + try: + soup = HTMLParser(preprocessed_html) + fixed_html = soup.html + except Exception: + soup = BeautifulSoup(preprocessed_html, 'html.parser') + fixed_html = str(soup) # Parse original DOM original_dom = html.fromstring(fixed_html) @@ -1017,7 +1152,7 @@ def simplify_html(html_str: str) -> Tuple[str, str]: ) # Process paragraphs (synchronously add IDs) - simplified_html = process_paragraphs(paragraphs, original_uid_map) + simplified_html = process_paragraphs(paragraphs, original_uid_map, cutoff_length) remove_all_uids(original_dom) original_html = etree.tostring( diff --git a/requirements.txt b/requirements.txt index a57dd50..d6c8a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +selectolax beautifulsoup4 fastapi html2text