diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index eede6cfb..493d46cc 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,12 +19,11 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') - is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) + simplified_html, original_html = simplify_html(typical_raw_html) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: @@ -33,6 +32,5 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: # 设置输出数据 pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = original_html # 保存原始标签HTML pre_data[PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML] = simplified_html # 保存简化后的HTML - pre_data[PreDataJsonKey.XPATH_MAPPING] = _ # 保存xpath return pre_data diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index bfff8f29..3c37da8e 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -3,33 +3,34 @@ import uuid from typing import Dict, List, Tuple -from bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser # 行内标签 inline_tags = { 'map', 'optgroup', 'span', 'br', 'input', 'time', 'u', 'strong', 'textarea', 'small', 'sub', 'samp', 'blink', 'b', 'code', 'nobr', 'strike', 'bdo', 'basefont', 'abbr', 'var', 'i', 'cccode-inline', - 'select', 's', 'pic', 'label', 'mark', 'object', 'dd', 'dt', 'ccmath-inline', 'svg', 'li', + 'select', 's', 'pic', 'label', 'mark', 'object', 'ccmath-inline', 'svg', 'button', 'a', 'font', 'dfn', 'sup', 'kbd', 'q', 'script', 'acronym', 'option', 'img', 'big', 'cite', 'em', 'marked-tail', 'marked-text' - # 'td', 'th' + # 'td', 'th', 'dd', 'dt', 'li' } +# 表格内部可能包含的跟表格相关的标签 +table_tags_set = {"caption", "colgroup", "col", "thead", "tbody", "tfoot", "tr", "td", "th"} + # 需要删除的标签 tags_to_remove = { + 'title', 'head', - 'header', - 'footer', 'nav', - 'aside', 'style', 'script', 'noscript', 'link', 'meta', 'iframe', - 'frame' + 'frame', } # 需要保留的特殊标签(即使它们是行内标签) @@ -37,7 +38,7 @@ # 需要删除的属性名模式(独立单词) ATTR_PATTERNS_TO_REMOVE = { - 'nav', 'footer', 'header', # 独立单词 + 'nav', # 'footer', 'header', # 独立单词 } # 需要删除的属性名模式(特定前缀/后缀) @@ -72,90 +73,76 @@ def build_uid_map(dom: html.HtmlElement) -> Dict[str, html.HtmlElement]: return {node.get('data-uid'): node for node in dom.iter() if node.get('data-uid')} -def is_unique_attribute(tree, attr_name, attr_value): - """检查给定的属性名和值组合是否在文档中唯一。""" - elements = tree.xpath(f"//*[@{attr_name}='{attr_value}']") - return len(elements) == 1 - - -def get_relative_xpath(element): - root_tree = element.getroottree() - current_element = element - path_from_element = [] - found_unique_ancestor = False - - # 从当前元素开始向上查找 - while current_element is not None and current_element.getparent() is not None: - siblings = [sib for sib in current_element.getparent() if sib.tag == current_element.tag] - - # 检查当前元素是否有唯一属性 - unique_attr = None - candidate_attrs = [ - attr for attr in current_element.attrib - if not (attr.startswith('data-') or attr == 'style' or - attr == '_item_id' or - (current_element.attrib[attr].startswith('{') and current_element.attrib[attr].endswith('}'))) - ] - - for attr in candidate_attrs: - if is_unique_attribute(root_tree, attr, current_element.attrib[attr]): - unique_attr = attr +def judge_table_parent(table_element, node_list): + for node in node_list: + ancestor = node.getparent() + while ancestor is not None: + if ancestor is table_element: + return True + elif ancestor.tag == 'table': break - - # 如果有唯一属性,构建相对路径并停止向上查找 - if unique_attr is not None: - path_from_element.insert(0, f'*[@{unique_attr}="{current_element.attrib[unique_attr]}"]') - found_unique_ancestor = True - break - else: - # 没有唯一属性,使用常规方式 - if len(siblings) > 1: - index = siblings.index(current_element) + 1 - path_from_element.insert(0, f'{current_element.tag}[{index}]') - else: - path_from_element.insert(0, current_element.tag) - - current_element = current_element.getparent() - - # 构建最终的XPath - if found_unique_ancestor: - return f'//{"/".join(path_from_element)}' - else: - # 如果没有找到唯一属性祖先,返回完整路径 - return f'//{"/".join(path_from_element)}' + ancestor = ancestor.getparent() + return False def is_data_table(table_element: html.HtmlElement) -> bool: """判断表格是否是数据表格而非布局表格.""" - # 检查表格是否有 caption 标签 - if table_element.xpath('.//caption'): - return True - - # 检查是否有 th 标签 - if table_element.xpath('.//th'): + # 检查当前表格(不包括内部嵌套表格)是否有 caption 标签 + caption_nodes = table_element.xpath('.//caption') + if judge_table_parent(table_element, caption_nodes): return True - # 检查是否有 thead 或 tfoot 标签 - if table_element.xpath('.//thead') or table_element.xpath('.//tfoot'): + # 检查当前表格(不包括内部嵌套表格)是否有 colgroup 或 col 标签 + col_nodes = table_element.xpath('.//col') + colgroup_nodes = table_element.xpath('.//colgroup') + if judge_table_parent(table_element, col_nodes) or judge_table_parent(table_element, colgroup_nodes): return True - # 检查是否有 colgroup 或 col 标签 - if table_element.xpath('.//colgroup') or table_element.xpath('.//col'): - return True - - # 检查是否有 summary 属性 - if table_element.get('summary'): + # 检查当前表格(不包括内部嵌套表格)单元格是否有 headers 属性 + cell_nodes = table_element.xpath(".//*[self::td or self::th][@headers]") + if judge_table_parent(table_element, cell_nodes): return True # 检查是否有 role="table" 或 data-table 属性 if table_element.get('role') == 'table' or table_element.get('data-table'): return True - # 检查单元格是否有 headers 属性 - if table_element.xpath('.//*[@headers]'): + for node in table_element.iterdescendants(): + if node.tag in table_tags_set: + continue + if node.tag not in inline_tags: + return False + + return True + + +def has_non_listitem_children(list_element): + """检查列表元素是否包含非列表项的直接子节点. + + :param list_element: lxml元素对象 (ul, ol, dl) + :return: True 如果存在非列表项的直接子节点,否则 False + """ + + # 根据列表类型确定允许的子元素标签 + if list_element.tag in ['ul', 'ol']: + allowed_tags = {'li'} + elif list_element.tag == 'dl': + allowed_tags = {'dt', 'dd'} + + # 使用XPath直接查找是否存在不允许的直接子元素 + # 例如,对于