diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index c3599fc8..61f113b8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,6 +1,7 @@ import json from typing import Any, List, Tuple +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,10 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags @@ -224,7 +227,9 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis Returns: list: 包含列表项内容的列表,即items """ - + ele_html = lxml_html.tostring(ele, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(ele_html) + ele = html_to_element(replace_tree_html) content_list = [] # 处理根元素文本 if ele.text and ele.text.strip(): @@ -239,6 +244,8 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis for child in ele.iterchildren(): text_paragraph = self.__extract_list_item_text(child) if len(text_paragraph) > 0: + json_paragraph = restore_sub_sup_from_text_regex(json.dumps(text_paragraph)) + text_paragraph = json.loads(json_paragraph) content_list.extend(text_paragraph) return content_list diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e4cc78ce..0d6e2b8e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,6 +1,9 @@ +import 
re from typing import Any, List, Tuple +from lxml import html from lxml.html import HtmlElement +from lxml.html.clean import Cleaner from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerException @@ -8,12 +11,34 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (element_to_html_unescaped, + html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags +new_inline_tags = inline_tags.union({'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption'}) + +allow_tags = ['table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption', 'sub', 'sup', 'ccmath-inline', 'ccmath-interline', 'cccode', 'cccode-inline'] + +cleaner = Cleaner( + safe_attrs_only=False, + page_structure=False, + style=True, + scripts=True, + comments=True, + links=False, + meta=True, + embedded=True, + frames=True, + forms=True, + annoying_tags=True, + allow_tags=allow_tags +) + # 空元素 VOID_ELEMENTS = { 'area', 'base', 'br', 'col', 'embed', 'hr', @@ -234,7 +259,7 @@ def process_node(node): if node.tail and node.tail.strip(): result.append(node.tail.strip()) else: - if node.tag == 'br' or node.tag not in inline_tags: + if node.tag == 'br' or node.tag not in new_inline_tags: result.append('\n\n') # 提取当前节点的文本 @@ -247,7 +272,7 @@ def process_node(node): process_node(child) # 处理节点的tail(元素闭合后的文本) if node.tail and node.tail.strip(): - if node.tag not in inline_tags: + if node.tag not in new_inline_tags: result.append('\n\n') cleaned_tail = node.tail.strip() result.append(html_normalize_space(cleaned_tail)) @@ -274,7 +299,10 @@ def __simplify_td_th_content(self, 
table_nest_level, elem: HtmlElement) -> None: else: math_res = self.__check_table_include_math_code(elem) elem.clear() - math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + if elem.tag not in new_inline_tags: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + "\n\n" + else: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) if elem.tag in VOID_ELEMENTS: elem_pre = elem.getprevious() if elem_pre is not None: @@ -292,22 +320,28 @@ def __get_table_body(self, table_type, table_nest_level, table_root): if table_type == 'empty': content = table_root.text_content() return content + table_html = html.tostring(table_root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(table_html) + table_root = html_to_element(replace_tree_html) + # 清理除了colspan和rowspan之外的属性 self.__simplify_td_th_content(table_nest_level, table_root) table_clean_attributes(table_root) + clean_html = cleaner.clean_html(self._element_to_html_entity(table_root)) + new_table_root = self._build_html_tree(clean_html) - # doc = html.fromstring(html_content) - for element in table_root.iter(): + pattern = re.compile(r'(\s*\n\s*\n\s*|\n{2,})') + for element in new_table_root.iter(): # 清理元素前后的空白(不影响.text和.tail的内容) if element.text is not None: - element.text = element.text.lstrip('\n\t ') + element.text = re.sub(pattern, '\n\n', element.text.strip()) if element.tail is not None: - if "\n\n" in element.tail: - element.tail = "\n\n" + element.tail.lstrip('\n\t ') - else: - element.tail = element.tail.lstrip('\n\t ') + element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip() + + tree_html = element_to_html_unescaped(new_table_root) + restore_tree_html = restore_sub_sup_from_text_regex(tree_html) - return self._element_to_html_entity(table_root) + return restore_tree_html def __do_extract_tables(self, root: HtmlElement) -> None: """递归处理所有子标签.""" diff --git 
a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 0052872c..6dc7e346 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -13,7 +13,9 @@ from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType from llm_web_kit.libs.html_utils import (element_to_html_unescaped, html_normalize_space, html_to_element, - process_sub_sup_tags) + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -65,7 +67,7 @@ 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', 'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline', 'marked-tail', 'marked-text', 'math','mspace', 'font', 'nobr', 'bdi', - 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins' + 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins', 'xhtml' } # 词间无分隔符的语言 @@ -205,7 +207,10 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' else: - words_sep = ' ' + if text2.startswith('tem_sub_') or text2.startswith('tem_sup_') or text1.endswith("tem_sub_start") or text1.endswith("tem_sup_start"): + words_sep = '' + else: + words_sep = ' ' txt = text1 + words_sep + text2 return self.replace_entities(txt.strip(), entities_map) @@ -222,12 +227,13 @@ def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[d Args: el: 代表一个段落的html元素 """ + _html = html.tostring(root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + root = html_to_element(replace_tree_html) + para_text = [] def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: - # 标记当前元素是否是sub或sup类型 - is_sub_sup = el.tag == 'sub' or el.tag == 'sup' 
- if el.tag == CCTag.CC_MATH_INLINE: if text: para_text.append({'c': text, 't': ParagraphTextType.TEXT}) @@ -254,19 +260,17 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: # 处理尾部文本 if el.tail and el.tail.strip(): - if is_sub_sup: - _new_tail = html_normalize_space(el.tail.strip()) - text += _new_tail - else: - _new_tail = html_normalize_space(el.tail.strip()) - new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail - text = self.__combine_text(text, new_tail, language) + _new_tail = html_normalize_space(el.tail.strip()) + new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail + text = self.__combine_text(text, new_tail, language) return text if final := __get_paragraph_text_recusive(root, ''): para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) + for item in para_text: + item['c'] = restore_sub_sup_from_text_regex(item['c']) return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index cf105b9f..d3205344 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple # from lxml.etree import _Element as HtmlElement +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,9 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from .text import PARAGRAPH_SEPARATOR @@ -90,10 
+92,14 @@ def __do_extract_title(self, root:HtmlElement) -> None: """ # 匹配需要替换的标签 if root.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + if root.tail and root.tail.strip(): + tail_text = root.tail.strip() + else: + tail_text = '' + root.tail = None title_text = self.__extract_title_text(root) title_raw_html = self._element_to_html(root) title_level = str(self.__extract_title_level(root.tag)) - tail_text = root.tail cc_element = self._build_cc_element(CCTag.CC_TITLE, title_text, tail_text, level=title_level, html=title_raw_html) self._replace_element(root, cc_element) return @@ -122,8 +128,9 @@ def __extract_title_text(self, header_el:HtmlElement) -> str: Returns: str: 标题的文本 """ + blks = [] + def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> list[str]: - blks = [] if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') @@ -134,21 +141,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li _new_text = html_normalize_space(el.text.strip()) blks.append(_new_text) - for child in el.getchildren(): - if child.tag == 'sub' or child.tag == 'sup': - blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail]) - else: - blks.extend(__extract_title_text_recusive(child)) - if with_tail: blks.append((el.tail or '').strip()) return blks - # 根元素不保留结尾 - blks = __extract_title_text_recusive(header_el, False) + _html = lxml_html.tostring(header_el, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + header_el = html_to_element(replace_tree_html) - return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR) + for child in header_el.iter(): + __extract_title_text_recusive(child, True) + return restore_sub_sup_from_text_regex(' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)) def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" diff --git a/llm_web_kit/libs/html_utils.py 
b/llm_web_kit/libs/html_utils.py index fede6cf0..05d6783b 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -452,6 +452,37 @@ def html_normalize_space(text: str) -> str: return text +def replace_sub_sup_with_text_regex(html_content): + """使用正则表达式将 HTML 中的 <sub>、<sup> 标签替换为特殊标记。""" + + def replacer(match): + tag = match.group(0).lower() + if tag.startswith('<sub'): + return 'tem_sub_start' + if tag == '</sub>': + return 'tem_sub_end' + if tag.startswith('<sup'): + return 'tem_sup_start' + if tag == '</sup>': + return 'tem_sup_end' + + pattern = r'</?(sub|sup)[^>]*>' + return re.sub(pattern, replacer, html_content, flags=re.IGNORECASE) + + +def restore_sub_sup_from_text_regex(processed_content): + """将<sub>、<sup>的替换标记还原为原始的 HTML 标签。""" + replacement_map = { + 'tem_sub_start': '<sub>', + 'tem_sub_end': '</sub>', + 'tem_sup_start': '<sup>', + 'tem_sup_end': '</sup>' + } + + pattern = '|'.join(re.escape(key) for key in replacement_map.keys()) + return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content) + + + def get_plain_text_fast(html_source: str) -> str: """使用lxml快速获取html中的纯文本. diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md index c8745567..bd8a9517 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md @@ -4,7 +4,7 @@ ### Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. 
### Limit Use of `assert` Statements diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt index 7b4f582d..2bdc3018 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt @@ -1,7 +1,7 @@ 主要内容 Single-Precision Conversion Best Practices Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. Limit Use of `assert` Statements - Do not use `assert` statements to define the properties of input arguments. - Do not use `assert` statements to test the type of a variable. For example, do not use diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index fb3ba4ff..c80d3568 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -66,7 +66,7 @@ def test_only_involve_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) table_body = parts[1][0].text_content() - assert table_body == '
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' + assert table_body == '
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' def test_table_include_img_label(self): """table是否包含img标签.""" @@ -89,7 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' + assert content == '
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -100,7 +100,7 @@ def test_cc_complex_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '\n\n\n\n
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' + assert content == '
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' @@ -155,10 +155,10 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
' parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -203,7 +203,7 @@ def test_nested_table1(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert '\n\n
\n\n
Advanced Search
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Home
Browse
Communities \n\n & Collections
Issue Date
Author
Title
Subject
Sign on to:
Receive email \n\n updates
My APO\n\n
authorized users
Edit Profile
About DSpace
\n\n

ANSTO Publications Online > \n\n Journal Publications > \n\n Journal Articles >

\n\n
Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
\n\n
' in content + assert '
Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors:Rajkhowa, R\n\n
Naik, R\n\n
Wang, L\n\n
Smith, SV\n\n
Wang, X
Keywords:Radioisotopes \n\n Transition Elements
\n\nBinding Energy
\n\nFibers
\n\nAbsorption
\n\nIons
Issue Date:15-Mar-2011
Publisher:Wiley-Blackwell
Citation:
Search APO
Advanced Search
Home
Browse
Communities\n\n& Collections
Issue Date
Author
Title
Subject
Sign on to:
Receive email\n\nupdates
My APO\n\nauthorized users
Edit Profile
Help
About DSpace
ANSTO Publications Online >\n\nJournal Publications >\n\nJournal Articles >
Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
' in content def test_nested_table2(self): """复杂嵌套表格.""" @@ -213,7 +213,7 @@ def test_nested_table2(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 2 content = parts[1][0].text_content() - assert '
Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors:Rajkhowa, R\n\nNaik, R\n\nWang, L\n\nSmith, SV\n\nWang, X
Keywords:Radioisotopes\n\nTransition Elements\n\nBinding Energy\n\nFibers\n\nAbsorption\n\nIons
Issue Date:15-Mar-2011
Publisher:Wiley-Blackwell
jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }}); \n\n .acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;} \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\'); \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n Main Numbers: \n\n (615) 452-8600 \n\n (888) 335-8722 \n\n \n\n \n\n \n\n \n\n facebook \n\n instagram \n\n twitter \n\n youtube \n\n \n\n \n\n \n\n \n\n Campuses \n\n \n\n Gallatin \n\n Cookeville \n\n Livingston \n\n Springfield \n\n \n\n \n\n \n\n \n\n \n\n Academic Divisions \n\n \n\n Business & Technology \n\n Health Sciences \n\n Humanities & Fine Arts \n\n Mathematics & Science \n\n Nursing \n\n Social Science & Education \n\n \n\n \n\n \n\n \n\n \n\n Resources \n\n \n\n Accreditation \n\n Bookstore \n\n Campus Police \n\n Contact Us \n\n Employee Directory \n\n IT Help Desk \n\n Library \n\n Marketing & Communications
Volunteer State Community College
May 24, 2024
2013-2014 VSCC Catalog
Select a Catalog \n\n 2024-2025 Undergraduate Catalog \n\n 2023-2024 Undergraduate Catalog [ARCHIVED CATALOG] \n\n 2022-2023' in content + assert '
jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }});\n\n.acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;}\n\nwindow.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\');\n\nMain Numbers:\n\n(615) 452-8600\n\n(888) 335-8722\n\nfacebook\n\ninstagram\n\ntwitter\n\nyoutube\n\nCampuses\n\nGallatin\n\nCookeville\n\nLivingston\n\nSpringfield\n\nAcademic Divisions\n\nBusiness & Technology\n\nHealth Sciences\n\nHumanities & Fine Arts\n\nMathematics & Science\n\nNursing\n\nSocial Science & Education\n\nResources\n\nAccreditation\n\nBookstore\n\nCampus Police\n\nContact Us\n\nEmployee Directory\n\nIT Help Desk\n\nLibrary\n\nMarketing & Communications
Volunteer State Community College
May 24, 2024
2013-2014 VSCC Catalog
Select a Catalog\n\n2024-2025 Undergraduate Catalog\n\n2023-2024 Undergraduate Catalog [ARCHIVED CATALOG]\n\n2022-2023' in content def test_nested_table3(self): """复杂嵌套表格.""" @@ -223,7 +223,7 @@ def test_nested_table3(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert "
What's New - Recent Content \n\n \n\n Members' Peak Updates \n\n Recent Trip Reports \n\n Recent Trip Report Comments \n\n Recently added Images \n\n Recently added Peaks \n\n List Completers \n\n \n\n Height List Completers \n\n Elevation List Completers \n\n County Summit Completers \n\n Wilderness Area Completers \n\n Member Profiles & Stats \n\n \n\n Member Profiles - Summary Stats \n\n Member Stats by Date Range & Charts \n\n Calendar Grid Completions \n\n Peaks Repeated \n\n Most Climbed Peaks \n\n Unclimbed Peaks \n\n US Peak Totals by State \n\n Member Tools \n\n \n\n Closest 50 Peaks by Member \n\n \n\n Closest 50 Map \n\n Closest 50 List \n\n Download your Peak List \n\n Search Trip Reports \n\n Unclimbed by Custom Group \n\n Export CSV, GPX, POI, TOPO! Files \n\n Elevation Threshold Progress Maps \n\n State Highest # Progress Maps \n\n County Summit Progress Maps \n\n Statewide County Summit Maps \n\n Prominence Progress Maps \n\n State Quads Progress Maps \n\n Quadrangle Lookup \n\n Distance Calculator \n\n Slope Angle Calculator \n\n Stats Category Leaders \n\n US Highest 1,000 Peaks \n\n \n\n US Highest 1,000 Member Area \n\n 1,000 Highest Peak List \n\n US Steepest 1,000 Peaks \n\n \n\n Steepness Member Area \n\n View 1,000 Steepest List \n\n US 2,000' Prominence \n\n \n\n US Prominence Member Area \n\n View US Prominence Peak Profiles \n\n View Member 5k Completion Maps \n\n Prominence Progress Maps \n\n US County Highpoints \n\n \n\n County Highpoints Member Area \n\n Highpoint Profiles - By State \n\n View Member's Completion Maps \n\n US State Highpoints \n\n \n\n US State Highpoints Member Area \n\n View State Highpoints List \n\n View Member's Completion Maps \n\n US Wilderness Area Peaks \n\n \n\n Wilderness Summits Member Area \n\n Wilderness Area Detail by State \n\n Wilderness HPs Member Area \n\n US National Park Peaks \n\n \n\n National Park Peaks Member Area \n\n National Park Peaks Detail by State" in content + assert "
What's New - Recent Content\n\nMembers' Peak Updates\n\nRecent Trip Reports\n\nRecent Trip Report Comments\n\nRecently added Images\n\nRecently added Peaks\n\nList Completers\n\nHeight List Completers\n\nElevation List Completers\n\nCounty Summit Completers\n\nWilderness Area Completers\n\nMember Profiles & Stats\n\nMember Profiles - Summary Stats\n\nMember Stats by Date Range & Charts\n\nCalendar Grid Completions\n\nPeaks Repeated\n\nMost Climbed Peaks\n\nUnclimbed Peaks\n\nUS Peak Totals by State\n\nMember Tools\n\nClosest 50 Peaks by Member\n\nClosest 50 Map\n\nClosest 50 List\n\nDownload your Peak List\n\nSearch Trip Reports\n\nUnclimbed by Custom Group\n\nExport CSV, GPX, POI, TOPO! Files\n\nElevation Threshold Progress Maps\n\nState Highest # Progress Maps\n\nCounty Summit Progress Maps\n\nStatewide County Summit Maps\n\nProminence Progress Maps\n\nState Quads Progress Maps\n\nQuadrangle Lookup\n\nDistance Calculator\n\nSlope Angle Calculator\n\nStats Category Leaders\n\nUS Highest 1,000 Peaks\n\nUS Highest 1,000 Member Area\n\n1,000 Highest Peak List\n\nUS Steepest 1,000 Peaks\n\nSteepness Member Area\n\nView 1,000 Steepest List\n\nUS 2,000' Prominence\n\nUS Prominence Member Area\n\nView US Prominence Peak Profiles\n\nView Member 5k Completion Maps\n\nProminence Progress Maps\n\nUS County Highpoints\n\nCounty Highpoints Member Area\n\nHighpoint Profiles - By State\n\nView Member's Completion Maps\n\nUS State Highpoints\n\nUS State Highpoints Member Area\n\nView State Highpoints List\n\nView Member's Completion Maps\n\nUS Wilderness Area Peaks\n\nWilderness Summits Member Area\n\nWilderness Area Detail by State" in content def test_nested_table4(self): """复杂嵌套表格.""" @@ -233,4 +233,4 @@ def test_nested_table4(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 4 content = parts[2][0].text_content() - assert '

Molecular line emissions from pre main sequence objects

Saraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). \n\n Molecular line emissions from pre main sequence objects. \n\n In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291. \n\n Full text available as:

\n\n
Preview
\n\n
PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\n
Download (239Kb)
    \n\n\n\n
    URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
    Google Scholar:Look up in Google Scholar
    \n\n

    Abstract

    We present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H 2 0 cooling.

    \n\n\n\n\n\n\n\n\n\n\n\n\n\n' in content + assert '
    Item Type:Conference Item
    Copyright Holders:1997 European Space Agency
    Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
    Academic Unit/Department:Science > Physical Sciences
    Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
    Item ID:32696
    Depositing User:Glenn White
    Molecular line emissions from pre main sequence objects\n\nSaraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). Molecular line emissions from pre main sequence objects. In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291.\n\nFull text available as:
    Preview
    PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\nDownload (239Kb)
    URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
    Google Scholar:Look up in Google Scholar
    Abstract\n\nWe present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H20 cooling.' in content diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index a0b71fa4..2edfbeab 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -18,17 +18,15 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" - assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" + assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" + assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" def test_title_tails_and_levels(title_recognizer): html_content = """

    TEST:import *TEST

    Tail

    aaa

    """ result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert element_to_html(result[0][0]) == '
    TEST: `import *` TEST
    ' + assert element_to_html(result[0][0]) == '
    TEST: `import *` TEST
    ' pass @@ -47,4 +45,4 @@ def test_title1(title_recognizer): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/title1.html', 'r') as file: html_content = file.read() result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) - assert 'Compare vibrational frequencies for two calculations for C <sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) + assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 5255efe9..533470ef 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -374,7 +374,7 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert len(content_list[0]) == 17 - assert content_list[0][3]['content']['html'] == "
    Item Type:Conference Item
    Copyright Holders:1997 European Space Agency
    Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
    Academic Unit/Department:Science > Physical Sciences
    Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
    Item ID:32696
    Depositing User:Glenn White
    up vote 17 down vote favorite \n\n 5
    I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?
    prime-numbers factoring
    " + assert content_list[0][3]['content']['html'] == "
    up vote 17 down vote favorite\n\n5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?\n\nprime-numbers factoring
    " def test_table_include_math_p_2(self): """table包含math和其他内容.""" @@ -386,7 +386,7 @@ def test_table_include_math_p_2(self): md_content = result.get_content_list().to_nlp_md() # with open('output_badcase_p2.md', 'w', encoding='utf-8') as f: # f.write(md_content) - self.assertIn('
    单位换算:

    数学公式区块: $1\\text{km}={10}^{3}\\text{m}$

    ', md_content) + self.assertIn('
    长度质量时间
    单位换算:数学公式区块: $1\\text{km}={10}^{3}\\text{m}$', md_content) def test_clean_tags(self): """测试clean_tag的preExtractor是否生效.""" @@ -491,7 +491,7 @@ def test_more_nt(self): result_content_list = result.get_content_list()._get_data() result = result_content_list[0][2]['content']['html'] assert '\n\t' not in result - assert len(result) == 2205 + assert len(result) == 1893 def test_math_physicsforums(self): """测试math_physicsforums网页中数学公式是[tex]和[itex]包裹的,且中间还有
    标签分割.""" @@ -636,7 +636,7 @@ def test_table_lack_pre_content(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() - assert result_content_list[0][22]['content']['html'] == '
    长度质量时间
    お名前【必須】お名前(カナ)
    ご連絡先【いずれか必須】

    メールアドレス

    電話番号

    ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。

    ' + assert result_content_list[0][22]['content']['html'] == '
    お名前【必須】お名前(カナ)
    ご連絡先【いずれか必須】
    メールアドレス電話番号
    ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。
    ' def test_td_include_specila_symbol(self): """测试td包含特殊符号|,需要转义."""