From 93122824be3dabac1bb7316c6c0393047d1192d7 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Mon, 24 Nov 2025 21:22:41 +0800
Subject: [PATCH 1/4] refactor: rework the table output format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../extractor/html/recognizer/table.py        | 24 ++++++++++++-------
 .../table_to_content_list_complex_res.json    |  5 ++--
 .../table_to_content_list_simple_res.json     |  2 +-
 .../extractor/html/recognizer/test_table.py   |  1 -
 .../extractor/test_extractor_chain.py         | 22 ++++++++---------
 .../input/assets/content_json.json            |  9 +++----
 6 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
index c2449bb7..7ec23317 100644
--- a/llm_web_kit/extractor/html/recognizer/table.py
+++ b/llm_web_kit/extractor/html/recognizer/table.py
@@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
         # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
         if table_type:
             cc_table_type = DocElementType.COMPLEX_TABLE
+            d = {
+                'type': cc_table_type,
+                'content': {
+                    'html': html_content,
+                    'table_nest_level': table_nest_level,
+                    "caption": [],
+                    "footnote": []
+                }
+            }
         else:
             cc_table_type = DocElementType.SIMPLE_TABLE
-        d = {
-            'type': cc_table_type,
-            # 'raw_content': raw_html_segment,
-            'content': {
-                'html': html_content,
-                'is_complex': table_type,
-                'table_nest_level': table_nest_level
+            d = {
+                'type': cc_table_type,
+                'content': {
+                    'html': html_content,
+                    "caption": [],
+                    "footnote": []
+                }
             }
-        }
         return d
 
     def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
index 9216b23b..ba341040 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
@@ -2,7 +2,8 @@
     "type": "complex_table",
     "content": {
         "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true, - "table_nest_level": null + "table_nest_level": null, + "caption": [], + "footnote": [] } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 57412c32..95c43154 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "content": {"html": "
12
34
", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
12
34
", "caption": [], "footnote": []}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index bc91d52d..fe17c919 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -131,7 +131,6 @@ def test_table_to_content_list_node_simple(self): expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] - assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index d5af6d7e..44680347 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -112,13 +112,11 @@ def test_html_pipeline(self):
         # 然后是simple table
         html_content = html_content_list[4]
         self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE)
-        self.assertEqual(html_content['content']['is_complex'], False)
         assert html_content['content']['html'].startswith('
1.12.13.14.1",
-            "is_complex": false,
-            "table_nest_level": "1"
+            "caption": [],
+            "footnote": []
         }
     },
     {
@@ -100,8 +100,9 @@
         "type": "complex_table",
         "content": {
             "html": "
123
4
567
", - "is_complex": true, - "table_nest_level": "1" + "table_nest_level": "1", + "caption": [], + "footnote": [] } }, { From 7455851fc979dc0cec8221573858419c744f1b66 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Tue, 25 Nov 2025 21:16:51 +0800 Subject: [PATCH 2/4] =?UTF-8?q?refactor:=20=E6=B8=85=E9=99=A4markdown?= =?UTF-8?q?=E4=B8=AD=E5=86=97=E4=BD=99=E7=9A=84=E6=8D=A2=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/list.py | 6 ++- llm_web_kit/extractor/html/recognizer/text.py | 2 +- llm_web_kit/input/datajson.py | 4 +- .../extractor/html/recognizer/test_text.py | 50 ++++++++++++++----- 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 723f292f..a33a7712 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,4 +1,5 @@ import json +import re from typing import Any, List, Tuple from lxml import html as lxml_html @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: paragraph[-1]['c'] += _new_tail else: + if len(paragraph) > 0 and el.tag not in inline_tags: + _new_tail = '$br$' + _new_tail paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) if paragraph: @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): - tem_json = json.dumps(item).replace('$br$', '\\n\\n') + tem_json = json.dumps(item, ensure_ascii=False) + tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json) text_paragraph[n] = json.loads(tem_json) return text_paragraph diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index d6b3e857..0b5b08cc 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: for item in para_text: if item['c'] is not None: - item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n') else: item['c'] = "" diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index c9b533c2..b02bc7d1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -51,7 +51,7 @@ class StructureMapper(ABC): def __init__(self): self.__txt_para_splitter = '\n' - self.__md_para_splitter = '\n\n' + self.__md_para_splitter = '' self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 @@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F if content_lst_node['type'] not in exclude_nodes: txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url) + if len(md_blocks) > 0 and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"): # 若段落间没有换行,则添加换行 + md_blocks.append("\n\n") if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index e3c6119f..924933c4 100644 --- 
a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -31,7 +31,7 @@ def test_text_1(self): '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播' result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) + assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) def test_text_2(self): """ @@ -53,7 +53,7 @@ def test_text_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md + assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md def test_text_3(self): """ @@ -75,7 +75,7 @@ def test_text_3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md + assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md def test_text_4(self): """ @@ -97,7 +97,7 @@ def test_text_4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md + assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md def test_text_5(self): """ @@ -119,7 +119,7 @@ def test_text_5(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md + assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' 
+        assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
 
     def test_text_6(self):
         """
@@ -165,7 +165,7 @@ def test_text_8(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
-        assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
+        assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
 
     def test_text_9(self):
         """
@@ -177,7 +177,7 @@ def test_text_9(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
-        assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
+        assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
 
     def test_text_10(self):
         """
@@ -199,7 +199,7 @@ def test_text_10(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n\n In the book' in content_md
+        assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n In the book' in content_md
 
     def test_text_11(self):
         """
@@ -381,7 +381,7 @@ def test_normalize_space2(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'December 10th 2009, 06:42 PM\n\n fearless901\n\n Can someone please tell me my code wont work, error after error\n\n\n\n im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong? \n\n \n\n \n\n\n\n Code' in content_md
+        assert 'December 10th 2009, 06:42 PM\nfearless901\nCan someone please tell me my code wont work, error after error\nim need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?\nCode' in content_md
 
     def test_normalize_space3(self):
         """
@@ -405,7 +405,7 @@ def test_normalize_space3(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md
+        assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\nF. A. Horrigan , S. H. Koozekanani and R. A. Paananen\nScitation Author Page\nPubMed\nGoogle Scholar\nSource' in content_md
 
     def test_normalize_space4(self):
         """
@@ -429,7 +429,7 @@ def test_normalize_space4(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
+        assert '1. DrDu\nLieber Hendrik,\nkannst Du hierzu was beitragen?\nIch finde keinen rechten Grund' in content_md
         assert 'Show Ignored Content' not in content_md  # 这个是隐藏标签,不应该被识别出来
 
     def test_Lack_content1(self):
         """
@@ -478,7 +478,7 @@ def test_para_br(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md
+        assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n $IQR = Q_{3}-Q_{1}' in content_md
 
     def test_para_has_none(self):
         """
@@ -528,6 +528,32 @@ def test_clean_invisible_elements(self):
         content_md = result.get_content_list().to_mm_md()
         assert "Choosing a selection results in a full page refresh." not in content_md
 
+    def test_clean_invisible_elements1(self):
+        """
+        清理隐藏标签
+        Returns:
+
+        """
+        chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test'))
+        self.assertIsNotNone(chain)
+        test_data = {
+            'track_id': 'text_md',
+            'dataset_name': 'text_md',
+            'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1',
+            'data_source_category': 'HTML',
+            'path': '000.html',
+            'main_path': '000.html',
+            'file_bytes': 1000,
+            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
+            'language': 'en'
+        }
+        input_data = DataJson(test_data)
+        result = chain.extract(input_data)
+        content_md = result.get_content_list().to_mm_md()
+        # assert "Choosing a selection results in a full page refresh." not in content_md
+        with open("/home/PJLAB/houlinfeng/projects/custom_plugins/000.md", 'w', encoding='utf-8') as f:
+            f.write(content_md)
+
     def test_empty_string_fix(self):
         """
         测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError

From 1dc596456567d6c6822af28791ab7070f494ec65 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Tue, 25 Nov 2025 21:27:45 +0800
Subject: [PATCH 3/4] fix: remove redundant code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../extractor/html/recognizer/test_text.py | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index 924933c4..0bd90084 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -528,32 +528,6 @@ def test_clean_invisible_elements(self):
         content_md = result.get_content_list().to_mm_md()
         assert "Choosing a selection results in a full page refresh." not in content_md
 
-    def test_clean_invisible_elements1(self):
-        """
-        清理隐藏标签
-        Returns:
-
-        """
-        chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test'))
-        self.assertIsNotNone(chain)
-        test_data = {
-            'track_id': 'text_md',
-            'dataset_name': 'text_md',
-            'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1',
-            'data_source_category': 'HTML',
-            'path': '000.html',
-            'main_path': '000.html',
-            'file_bytes': 1000,
-            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
-            'language': 'en'
-        }
-        input_data = DataJson(test_data)
-        result = chain.extract(input_data)
-        content_md = result.get_content_list().to_mm_md()
-        # assert "Choosing a selection results in a full page refresh." not in content_md
-        with open("/home/PJLAB/houlinfeng/projects/custom_plugins/000.md", 'w', encoding='utf-8') as f:
-            f.write(content_md)
-
     def test_empty_string_fix(self):
         """
         测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError

From 31d85b4517c81e9812c0d44485c85c13cd0e8b84 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Thu, 27 Nov 2025 16:57:05 +0800
Subject: [PATCH 4/4] fix: update api code
---
 llm_web_kit/api/database.py         | 2 ++
 llm_web_kit/api/models/db_models.py | 3 ++-
 llm_web_kit/api/routers/htmls.py    | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llm_web_kit/api/database.py b/llm_web_kit/api/database.py
index f58e1752..0c93a7b7 100644
--- a/llm_web_kit/api/database.py
+++ b/llm_web_kit/api/database.py
@@ -43,6 +43,8 @@ def initialize(self):
             pool_pre_ping=True,
             pool_size=settings.db_pool_size,
             max_overflow=settings.db_max_overflow,
+            pool_recycle=3600,
+            pool_use_lifo=True,
         )
 
         # 创建异步会话工厂
diff --git a/llm_web_kit/api/models/db_models.py b/llm_web_kit/api/models/db_models.py
index 2a15b6c2..871093fc 100644
--- a/llm_web_kit/api/models/db_models.py
+++ b/llm_web_kit/api/models/db_models.py
@@ -6,6 +6,7 @@
 from datetime import datetime
 
 from sqlalchemy import Column, DateTime, Integer, String, Text
+from sqlalchemy.dialects.mysql import LONGTEXT
 from sqlalchemy.ext.declarative import declarative_base
 
 Base = declarative_base()
@@ -19,7 +20,7 @@ class RequestLog(Base):
     id = Column(Integer, primary_key=True, autoincrement=True, comment='主键ID')
     request_id = Column(String(64), nullable=False, unique=True, index=True, comment='请求ID')
     input_type = Column(String(32), nullable=False, comment='输入类型: html_content, url, file')
-    input_html = Column(Text, nullable=True, comment='输入HTML字符串')
+    input_html = Column(LONGTEXT, nullable=True, comment='输入HTML字符串')
     url = Column(Text, nullable=True, comment='输入URL地址')
     output_markdown = Column(Text, nullable=True, comment='输出Markdown内容')
     status = Column(String(32), default='processing', nullable=False, comment='状态: processing-处理中, success-成功, fail-失败')
diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
index 0a505ba6..b329e5e8 100644
--- a/llm_web_kit/api/routers/htmls.py
+++ b/llm_web_kit/api/routers/htmls.py
@@ -138,7 +138,7 @@ async def upload_html_file(
             except Exception as commit_error:
                 logger.error(f'提交初始日志时出错: {commit_error}')
 
-        result = await html_service.parse_html(html_content=html_content)
+        result = await html_service.parse_html(html_content=html_content, url="www.baidu.com")
 
         # 更新日志为成功
         await RequestLogService.update_log_success(