ccprocessor · e06084 · Sep 11, 2025 · Jun 9, 2025 · Jun 9, 2025 · Jun 9, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
@@ -1,15 +1,18 @@
 import json
 from typing import Any, List, Tuple
 
+from lxml import html as lxml_html
 from lxml.html import HtmlElement
 from overrides import override
 
 from llm_web_kit.exception.exception import HtmlListRecognizerException
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
-from llm_web_kit.libs.html_utils import (html_normalize_space,
-                                         process_sub_sup_tags)
+from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element,
+                                         process_sub_sup_tags,
+                                         replace_sub_sup_with_text_regex,
+                                         restore_sub_sup_from_text_regex)
 from llm_web_kit.libs.text_utils import normalize_text_segment
 
 from .text import inline_tags
@@ -224,7 +227,9 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
         Returns:
             list: 包含列表项内容的列表，即items
         """
-
+        ele_html = lxml_html.tostring(ele, encoding='utf-8').decode()
+        replace_tree_html = replace_sub_sup_with_text_regex(ele_html)
+        ele = html_to_element(replace_tree_html)
         content_list = []
         # 处理根元素文本
         if ele.text and ele.text.strip():
@@ -239,6 +244,8 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
         for child in ele.iterchildren():
             text_paragraph = self.__extract_list_item_text(child)
             if len(text_paragraph) > 0:
+                json_paragraph = restore_sub_sup_from_text_regex(json.dumps(text_paragraph))
+                text_paragraph = json.loads(json_paragraph)
                 content_list.extend(text_paragraph)
         return content_list
 

diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
@@ -1,19 +1,44 @@
+import re
 from typing import Any, List, Tuple
 
+from lxml import html
 from lxml.html import HtmlElement
+from lxml.html.clean import Cleaner
 from overrides import override
 
 from llm_web_kit.exception.exception import HtmlTableRecognizerException
 from llm_web_kit.extractor.html.recognizer.ccmath import MathRecognizer
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType
-from llm_web_kit.libs.html_utils import (html_normalize_space,
-                                         process_sub_sup_tags)
+from llm_web_kit.libs.html_utils import (element_to_html_unescaped,
+                                         html_normalize_space, html_to_element,
+                                         process_sub_sup_tags,
+                                         replace_sub_sup_with_text_regex,
+                                         restore_sub_sup_from_text_regex)
 from llm_web_kit.libs.text_utils import normalize_text_segment
 
 from .text import inline_tags
 
+new_inline_tags = inline_tags.union({'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption'})  # inline_tags plus table-structure tags; consulted in this module when deciding whether to emit a '\n\n' break
+
+allow_tags = ['table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption', 'sub', 'sup', 'ccmath-inline', 'ccmath-interline', 'cccode', 'cccode-inline']  # tag whitelist handed to the Cleaner below
+
+cleaner = Cleaner(  # module-level lxml Cleaner; __get_table_body calls clean_html() on the simplified table with it
+    safe_attrs_only=False,
+    page_structure=False,
+    style=True,
+    scripts=True,
+    comments=True,
+    links=False,
+    meta=True,
+    embedded=True,
+    frames=True,
+    forms=True,
+    annoying_tags=True,
+    allow_tags=allow_tags  # NOTE(review): some lxml releases reject allow_tags unless remove_unknown_tags=False is also passed - confirm against the pinned lxml version
+)
+
 # 空元素
 VOID_ELEMENTS = {
     'area', 'base', 'br', 'col', 'embed', 'hr',
@@ -234,7 +259,7 @@ def process_node(node):
                     if node.tail and node.tail.strip():
                         result.append(node.tail.strip())
                 else:
-                    if node.tag == 'br' or node.tag not in inline_tags:
+                    if node.tag == 'br' or node.tag not in new_inline_tags:
                         result.append('\n\n')
 
                     # 提取当前节点的文本
@@ -247,7 +272,7 @@ def process_node(node):
                         process_node(child)
                     # 处理节点的tail（元素闭合后的文本）
                     if node.tail and node.tail.strip():
-                        if node.tag not in inline_tags:
+                        if node.tag not in new_inline_tags:
                             result.append('\n\n')
                         cleaned_tail = node.tail.strip()
                         result.append(html_normalize_space(cleaned_tail))
@@ -274,7 +299,10 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
         else:
             math_res = self.__check_table_include_math_code(elem)
             elem.clear()
-            math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)
+            if elem.tag not in new_inline_tags:
+                math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + "\n\n"
+            else:
+                math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)
             if elem.tag in VOID_ELEMENTS:
                 elem_pre = elem.getprevious()
                 if elem_pre is not None:
@@ -292,22 +320,28 @@ def __get_table_body(self, table_type, table_nest_level, table_root):
         if table_type == 'empty':
             content = table_root.text_content()
             return content
+        table_html = html.tostring(table_root, encoding='utf-8').decode()
+        replace_tree_html = replace_sub_sup_with_text_regex(table_html)
+        table_root = html_to_element(replace_tree_html)
+
         # 清理除了colspan和rowspan之外的属性
         self.__simplify_td_th_content(table_nest_level, table_root)
         table_clean_attributes(table_root)
+        clean_html = cleaner.clean_html(self._element_to_html_entity(table_root))
+        new_table_root = self._build_html_tree(clean_html)
 
-        # doc = html.fromstring(html_content)
-        for element in table_root.iter():
+        pattern = re.compile(r'(\s*\n\s*\n\s*|\n{2,})')
+        for element in new_table_root.iter():
             # 清理元素前后的空白（不影响.text和.tail的内容）
             if element.text is not None:
-                element.text = element.text.lstrip('\n\t ')
+                element.text = re.sub(pattern, '\n\n', element.text.strip())
             if element.tail is not None:
-                if "\n\n" in element.tail:
-                    element.tail = "\n\n" + element.tail.lstrip('\n\t ')
-                else:
-                    element.tail = element.tail.lstrip('\n\t ')
+                element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip()
+
+        tree_html = element_to_html_unescaped(new_table_root)
+        restore_tree_html = restore_sub_sup_from_text_regex(tree_html)
 
-        return self._element_to_html_entity(table_root)
+        return restore_tree_html
 
     def __do_extract_tables(self, root: HtmlElement) -> None:
         """递归处理所有子标签."""

diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -13,7 +13,9 @@
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
 from llm_web_kit.libs.html_utils import (element_to_html_unescaped,
                                          html_normalize_space, html_to_element,
-                                         process_sub_sup_tags)
+                                         process_sub_sup_tags,
+                                         replace_sub_sup_with_text_regex,
+                                         restore_sub_sup_from_text_regex)
 
 special_symbols = [  # TODO 从文件读取
     '®',  # 注册商标符号
@@ -65,7 +67,7 @@
     'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
     'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline',
     'marked-tail', 'marked-text', 'math','mspace', 'font', 'nobr', 'bdi',
-    'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins'
+    'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins', 'xhtml'
 }
 
 # 词间无分隔符的语言
@@ -205,7 +207,10 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
             if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
                 words_sep = ''
             else:
-                words_sep = ' '
+                if text2.startswith('tem_sub_') or text2.startswith('tem_sup_') or text1.endswith("tem_sub_start") or text1.endswith("tem_sup_start"):
+                    words_sep = ''
+                else:
+                    words_sep = ' '
             txt = text1 + words_sep + text2
             return self.replace_entities(txt.strip(), entities_map)
 
@@ -222,12 +227,13 @@ def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[d
         Args:
             el: 代表一个段落的html元素
         """
+        _html = html.tostring(root, encoding='utf-8').decode()
+        replace_tree_html = replace_sub_sup_with_text_regex(_html)
+        root = html_to_element(replace_tree_html)
+
         para_text = []
 
         def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
-            # 标记当前元素是否是sub或sup类型
-            is_sub_sup = el.tag == 'sub' or el.tag == 'sup'
-
             if el.tag == CCTag.CC_MATH_INLINE:
                 if text:
                     para_text.append({'c': text, 't': ParagraphTextType.TEXT})
@@ -254,19 +260,17 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
 
             # 处理尾部文本
             if el.tail and el.tail.strip():
-                if is_sub_sup:
-                    _new_tail = html_normalize_space(el.tail.strip())
-                    text += _new_tail
-                else:
-                    _new_tail = html_normalize_space(el.tail.strip())
-                    new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail
-                    text = self.__combine_text(text, new_tail, language)
+                _new_tail = html_normalize_space(el.tail.strip())
+                new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail
+                text = self.__combine_text(text, new_tail, language)
 
             return text
 
         if final := __get_paragraph_text_recusive(root, ''):
             para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT})
 
+        for item in para_text:
+            item['c'] = restore_sub_sup_from_text_regex(item['c'])
         return para_text
 
     def __extract_paragraphs(self, root: HtmlElement):

diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py
@@ -1,15 +1,17 @@
 from typing import List, Tuple
 
 # from lxml.etree import _Element as HtmlElement
+from lxml import html as lxml_html
 from lxml.html import HtmlElement
 from overrides import override
 
 from llm_web_kit.exception.exception import HtmlTitleRecognizerException
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType
-from llm_web_kit.libs.html_utils import (html_normalize_space,
-                                         process_sub_sup_tags)
+from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element,
+                                         replace_sub_sup_with_text_regex,
+                                         restore_sub_sup_from_text_regex)
 
 from .text import PARAGRAPH_SEPARATOR
 
@@ -90,10 +92,14 @@ def __do_extract_title(self, root:HtmlElement) -> None:
         """
         # 匹配需要替换的标签
         if root.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            if root.tail and root.tail.strip():
+                tail_text = root.tail.strip()
+            else:
+                tail_text = ''
+            root.tail = None
             title_text = self.__extract_title_text(root)
             title_raw_html = self._element_to_html(root)
             title_level = str(self.__extract_title_level(root.tag))
-            tail_text = root.tail
             cc_element = self._build_cc_element(CCTag.CC_TITLE, title_text, tail_text, level=title_level, html=title_raw_html)
             self._replace_element(root, cc_element)
             return
@@ -122,8 +128,9 @@ def __extract_title_text(self, header_el:HtmlElement) -> str:
         Returns:
             str: 标题的文本
         """
+        blks = []
+
         def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> list[str]:
-            blks = []
 
             if el.tag == CCTag.CC_CODE_INLINE:
                 blks.append(f'`{el.text}`')
@@ -134,21 +141,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
                     _new_text = html_normalize_space(el.text.strip())
                     blks.append(_new_text)
 
-            for child in el.getchildren():
-                if child.tag == 'sub' or child.tag == 'sup':
-                    blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail])
-                else:
-                    blks.extend(__extract_title_text_recusive(child))
-
             if with_tail:
                 blks.append((el.tail or '').strip())
 
             return blks
 
-        # 根元素不保留结尾
-        blks = __extract_title_text_recusive(header_el, False)
+        _html = lxml_html.tostring(header_el, encoding='utf-8').decode()
+        replace_tree_html = replace_sub_sup_with_text_regex(_html)
+        header_el = html_to_element(replace_tree_html)
 
-        return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)
+        for child in header_el.iter():
+            __extract_title_text_recusive(child, True)
+        return restore_sub_sup_from_text_regex(' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR))
 
     def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]:
         """获取element的属性."""

diff --git a/llm_web_kit/libs/html_utils.py b/llm_web_kit/libs/html_utils.py
@@ -452,6 +452,37 @@ def html_normalize_space(text: str) -> str:
         return text
 
 
+def replace_sub_sup_with_text_regex(html_content):
+    """Replace <sub>/<sup> opening and closing tags with plain-text markers.
+
+    Opening tags (attributes included) become 'tem_sub_start'/'tem_sup_start';
+    closing tags become 'tem_sub_end'/'tem_sup_end'. Matching ignores case.
+    """
+
+    def replacer(match):
+        # Classify by the captured slash rather than an exact '</sub>' compare,
+        # so a closer such as '</sub >' still yields an end marker instead of
+        # falling through with no replacement and unbalancing the markers.
+        slash, name = match.group(1), match.group(2).lower()
+        return f'tem_{name}_end' if slash else f'tem_{name}_start'
+
+    pattern = r'<(/?)(sub|sup)\b[^>]*>'
+    return re.sub(pattern, replacer, html_content, flags=re.IGNORECASE)
+
+
+def restore_sub_sup_from_text_regex(processed_content):
+    """Turn the 'tem_sub_*' / 'tem_sup_*' text markers back into bare HTML tags."""
+    marker_to_tag = {
+        'tem_sub_start': '<sub>',
+        'tem_sub_end': '</sub>',
+        'tem_sup_start': '<sup>',
+        'tem_sup_end': '</sup>'
+    }
+    # One alternation over the escaped markers; each hit is looked up in the map.
+    marker_re = '|'.join(map(re.escape, marker_to_tag))
+    return re.sub(marker_re, lambda hit: marker_to_tag[hit.group(0)], processed_content)
+
+
 def get_plain_text_fast(html_source: str) -> str:
     """使用lxml快速获取html中的纯文本.
 

diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md
@@ -4,7 +4,7 @@
 
 ### Use Integers for Index Variables
 
-In MATLAB<sup>®</sup>code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them.
+In MATLAB<sup>®</sup> code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them.
 
 ### Limit Use of `assert` Statements
 

diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt
@@ -1,7 +1,7 @@
 主要内容
 Single-Precision Conversion Best Practices
 Use Integers for Index Variables
-In MATLAB<sup>®</sup>code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them.
+In MATLAB<sup>®</sup> code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them.
 Limit Use of `assert` Statements
 - Do not use `assert` statements to define the properties of input arguments.
 - Do not use `assert` statements to test the type of a variable. For example, do not use