From 93122824be3dabac1bb7316c6c0393047d1192d7 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Mon, 24 Nov 2025 21:22:41 +0800
Subject: [PATCH 1/4] refactor: rework the table output format
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../extractor/html/recognizer/table.py        | 24 ++++++++++++-------
 .../table_to_content_list_complex_res.json    |  5 ++--
 .../table_to_content_list_simple_res.json     |  2 +-
 .../extractor/html/recognizer/test_table.py   |  1 -
 .../extractor/test_extractor_chain.py         | 22 ++++++++---------
 .../input/assets/content_json.json            |  9 +++----
 6 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
index c2449bb7..7ec23317 100644
--- a/llm_web_kit/extractor/html/recognizer/table.py
+++ b/llm_web_kit/extractor/html/recognizer/table.py
@@ -100,17 +100,25 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
         # 使用传入的 raw_html_segment 或将 parsed_content 转换为字符串
         if table_type:
             cc_table_type = DocElementType.COMPLEX_TABLE
+            d = {
+                'type': cc_table_type,
+                'content': {
+                    'html': html_content,
+                    'table_nest_level': table_nest_level,
+                    "caption": [],
+                    "footnote": []
+                }
+            }
         else:
             cc_table_type = DocElementType.SIMPLE_TABLE
-        d = {
-            'type': cc_table_type,
-            # 'raw_content': raw_html_segment,
-            'content': {
-                'html': html_content,
-                'is_complex': table_type,
-                'table_nest_level': table_nest_level
+            d = {
+                'type': cc_table_type,
+                'content': {
+                    'html': html_content,
+                    "caption": [],
+                    "footnote": []
+                }
             }
-        }
         return d
 
     def __is_contain_cc_html(self, cc_html: HtmlElement) -> bool:
diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
index 9216b23b..ba341040 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
+++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_complex_res.json
@@ -2,7 +2,8 @@
     "type": "complex_table",
     "content": {
         "html": "
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
", - "is_complex": true, - "table_nest_level": null + "table_nest_level": null, + "caption": [], + "footnote": [] } } diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json index 57412c32..95c43154 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/recognizer/table_to_content_list_simple_res.json @@ -1 +1 @@ -{"type": "simple_table", "content": {"html": "
12
34
", "is_complex": false}} \ No newline at end of file +{"type": "simple_table", "content": {"html": "
12
34
", "caption": [], "footnote": []}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index bc91d52d..fe17c919 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -131,7 +131,6 @@ def test_table_to_content_list_node_simple(self): expect = base_dir.joinpath(test_case['expected'][0]) expect_json = expect.read_text(encoding='utf-8') assert result['type'] == json.loads(expect_json)['type'] - assert result['content']['is_complex'] == json.loads(expect_json)['content']['is_complex'] self.assertTrue(result['content']['html'].startswith('')) self.assertTrue(result['content']['html'].endswith('
diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py
index d5af6d7e..44680347 100644
--- a/tests/llm_web_kit/extractor/test_extractor_chain.py
+++ b/tests/llm_web_kit/extractor/test_extractor_chain.py
@@ -112,13 +112,11 @@ def test_html_pipeline(self):
         # 然后是simple table
         html_content = html_content_list[4]
         self.assertEqual(html_content['type'], DocElementType.SIMPLE_TABLE)
-        self.assertEqual(html_content['content']['is_complex'], False)
         assert html_content['content']['html'].startswith('
1.12.13.14.1",
-            "is_complex": false,
-            "table_nest_level": "1"
+            "caption": [],
+            "footnote": []
         }
     },
     {
@@ -100,8 +100,9 @@
         "type": "complex_table",
         "content": {
             "html": "
123
4
567
", - "is_complex": true, - "table_nest_level": "1" + "table_nest_level": "1", + "caption": [], + "footnote": [] } }, { From 7455851fc979dc0cec8221573858419c744f1b66 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Tue, 25 Nov 2025 21:16:51 +0800 Subject: [PATCH 2/4] =?UTF-8?q?refactor:=20=E6=B8=85=E9=99=A4markdown?= =?UTF-8?q?=E4=B8=AD=E5=86=97=E4=BD=99=E7=9A=84=E6=8D=A2=E8=A1=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/extractor/html/recognizer/list.py | 6 ++- llm_web_kit/extractor/html/recognizer/text.py | 2 +- llm_web_kit/input/datajson.py | 4 +- .../extractor/html/recognizer/test_text.py | 50 ++++++++++++++----- 4 files changed, 47 insertions(+), 15 deletions(-) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 723f292f..a33a7712 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,4 +1,5 @@ import json +import re from typing import Any, List, Tuple from lxml import html as lxml_html @@ -196,6 +197,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT: paragraph[-1]['c'] += _new_tail else: + if len(paragraph) > 0 and el.tag not in inline_tags: + _new_tail = '$br$' + _new_tail paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT}) if paragraph: @@ -212,7 +215,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): - tem_json = json.dumps(item).replace('$br$', '\\n\\n') + tem_json = json.dumps(item, ensure_ascii=False) + tem_json = re.sub(r'(\s*\$br\$\s*)+', r'\\n', tem_json) text_paragraph[n] = json.loads(tem_json) return text_paragraph diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index d6b3e857..0b5b08cc 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -271,7 +271,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: for item in para_text: if item['c'] is not None: - item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', PARAGRAPH_SEPARATOR) + item['c'] = restore_sub_sup_from_text_regex(item['c']).replace('$br$', '\n') else: item['c'] = "" diff --git a/llm_web_kit/input/datajson.py b/llm_web_kit/input/datajson.py index c9b533c2..b02bc7d1 100644 --- a/llm_web_kit/input/datajson.py +++ b/llm_web_kit/input/datajson.py @@ -51,7 +51,7 @@ class StructureMapper(ABC): def __init__(self): self.__txt_para_splitter = '\n' - self.__md_para_splitter = '\n\n' + self.__md_para_splitter = '' self.__text_end = '\n' self.__list_item_start = '-' # md里的列表项前缀 self.__list_para_prefix = ' ' # 两个空格,md里的列表项非第一个段落的前缀:如果多个段落的情况,第二个以及之后的段落前缀 @@ -117,6 +117,8 @@ def __to_md(self, exclude_nodes=[], exclude_inline_types=[], use_raw_image_url=F if content_lst_node['type'] not in exclude_nodes: txt_content = self.__content_lst_node_2_md(content_lst_node, exclude_inline_types, use_raw_image_url) + if len(md_blocks) > 0 and not txt_content.startswith("\n") and not md_blocks[-1].endswith("\n"): # 若段落间没有换行,则添加换行 + md_blocks.append("\n\n") if txt_content and len(txt_content) > 0: md_blocks.append(txt_content) diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py index e3c6119f..924933c4 100644 --- 
a/tests/llm_web_kit/extractor/html/recognizer/test_text.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py @@ -31,7 +31,7 @@ def test_text_1(self): '中共中央政治局召开会议审议《成-2020年10月16日新闻联播', 'zh') == '知识乱象\n中共中央政治局召开会议审议《成-2020年10月16日新闻联播' result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content) - assert '知识乱象\\n\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) + assert '知识乱象\\n 中共中央政治局' in element_to_html_unescaped(result[587][0]) def test_text_2(self): """ @@ -53,7 +53,7 @@ def test_text_2(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Selecting Rivet Sets:\n\n To develop maximum power' in content_md + assert 'Selecting Rivet Sets:\n To develop maximum power' in content_md def test_text_3(self): """ @@ -75,7 +75,7 @@ def test_text_3(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert "1. The problem statement, all variables and given/known data\n\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n\n 1. di=22.22\n\n 2. Dont know" in content_md + assert "1. The problem statement, all variables and given/known data\n A woman of height 1.7 meters stands directly in front of a convex mirror 2.0 meters away. The mirror has a radius of curvature, R=-50cm. Find the location and size of a woman's image using the ray diagram and mirror/lens equation.\n\n----------\n\n 2. The speed of light in a material is 2.50x10^8 meters per second. What is the index of refraction of the material?\n\n 2. Relevant equations\n\n 3. The attempt at a solution\n 1. di=22.22\n\n 2. Dont know" in content_md def test_text_4(self): """ @@ -97,7 +97,7 @@ def test_text_4(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert '1. The problem statement, all variables and given/known data\n\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n\n I solved the problem' in content_md + assert '1. The problem statement, all variables and given/known data\n 2. Relevant equations\n\n See attachment\n\n 3. The attempt at a solution\n I solved the problem' in content_md def test_text_5(self): """ @@ -119,7 +119,7 @@ def test_text_5(self): input_data = DataJson(test_data) result = chain.extract(input_data) content_md = result.get_content_list().to_mm_md() - assert 'Please Note:\n\n 1. Charge the battery on receiving even if it will not be used soon.\n\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md + assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' 
+        assert 'Please Note:\n 1. Charge the battery on receiving even if it will not be used soon.\n 2. Charge the battery EVERY MONTH if not in use for long periods to prevent over-discharging of the battery. This can cause irreparable damage to it.' in content_md
 
     def test_text_6(self):
         """
@@ -165,7 +165,7 @@ def test_text_8(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text8.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
-        assert "40xy\' -ln(x^8) = 0\\n\\n\\n\\n Initial Condition: y(1)=31\\n\\n\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
+        assert "40xy\' -ln(x^8) = 0\\n\\n Initial Condition: y(1)=31\\n\\n Work:" in element_to_html_unescaped(result[69][0]) and BaseHTMLElementRecognizer.is_cc_html(result[69][0])
 
     def test_text_9(self):
         """
@@ -177,7 +177,7 @@ def test_text_9(self):
         with open(Path(__file__).parent.parent.parent / 'assets/extractor_chain_input/good_data/html/text9.html', 'r') as file:
             html_content = file.read()
         result = self.text_recognize.recognize('http://www.baidu.com', [(html_to_element(html_content), html_to_element(html_content))], html_content)
-        assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
+        assert '1) Consider the formula f(x)=lim(n-->infinity)((x^n)/(1+x^n)).\\n Let D={x:f(x) is an element of R}. Calculate f(x) for all x elements of D and determine where f: D-->R is continuous.\\n\\n 2) Let f: D-->R and suppose that f(x) greater than equal 0 for all x elements of D. Define sqrt(f)-->R by (sqrt(f))(x) = sqrt(f(x)). If f is continuous at c elements of D, prove that sqrt(f) is continuous at c.' in element_to_html_unescaped(result[63][0]) and BaseHTMLElementRecognizer.is_cc_html(result[63][0])
 
     def test_text_10(self):
         """
@@ -199,7 +199,7 @@ def test_text_10(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n\n In the book' in content_md
+        assert 'So far I have 2 sets of questions (but I\'m onlin in the 2nd chapter now\n\n![:smile:](d80757e36ca9835f7237339959a1fa1d929bb5c5297acb457475459d6da12278 "Smile :smile:")\n\n)\n\n 1)\n In the book' in content_md
 
     def test_text_11(self):
         """
@@ -381,7 +381,7 @@ def test_normalize_space2(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'December 10th 2009, 06:42 PM\n\n fearless901\n\n Can someone please tell me my code wont work, error after error\n\n\n\n im need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong? \n\n \n\n \n\n\n\n Code' in content_md
+        assert 'December 10th 2009, 06:42 PM\nfearless901\nCan someone please tell me my code wont work, error after error\nim need to write code to get height and time of the fluid in a reservoir, help guys. is my functions wrong?\nCode' in content_md
 
     def test_normalize_space3(self):
         """
@@ -405,7 +405,7 @@ def test_normalize_space3(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\n\n F. A. Horrigan , S. H. Koozekanani and R. A. Paananen\n\n Scitation Author Page\n\n PubMed\n\n Google Scholar\n\n Source' in content_md
+        assert '### Volume 6, Issue 3, 01 February 1965\n\n- INFRARED LASER ACTION AND LIFETIMES IN ARGON II\nF. A. Horrigan , S. H. Koozekanani and R. A. Paananen\nScitation Author Page\nPubMed\nGoogle Scholar\nSource' in content_md
 
     def test_normalize_space4(self):
         """
@@ -429,7 +429,7 @@ def test_normalize_space4(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert '1. DrDu\n\n Lieber Hendrik, \n\n kannst Du hierzu was beitragen? \n\n Ich finde keinen rechten Grund' in content_md
+        assert '1. DrDu\nLieber Hendrik,\nkannst Du hierzu was beitragen?\nIch finde keinen rechten Grund' in content_md
         assert 'Show Ignored Content' not in content_md  # 这个是隐藏标签,不应该被识别出来
 
     def test_Lack_content1(self):
         """
@@ -478,7 +478,7 @@ def test_para_br(self):
         input_data = DataJson(test_data)
         result = chain.extract(input_data)
         content_md = result.get_content_list().to_mm_md()
-        assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n\n $IQR = Q_{3}-Q_{1}' in content_md
+        assert 'The interquartile range formula is the first quartile subtracted from the third quartile:\n $IQR = Q_{3}-Q_{1}' in content_md
 
     def test_para_has_none(self):
         """
@@ -528,6 +528,32 @@ def test_clean_invisible_elements(self):
         content_md = result.get_content_list().to_mm_md()
         assert "Choosing a selection results in a full page refresh." not in content_md
 
+    def test_clean_invisible_elements1(self):
+        """
+        清理隐藏标签
+        Returns:
+
+        """
+        chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test'))
+        self.assertIsNotNone(chain)
+        test_data = {
+            'track_id': 'text_md',
+            'dataset_name': 'text_md',
+            'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1',
+            'data_source_category': 'HTML',
+            'path': '000.html',
+            'main_path': '000.html',
+            'file_bytes': 1000,
+            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
+            'language': 'en'
+        }
+        input_data = DataJson(test_data)
+        result = chain.extract(input_data)
+        content_md = result.get_content_list().to_mm_md()
+        # assert "Choosing a selection results in a full page refresh." not in content_md
+        with open("/home/PJLAB/houlinfeng/projects/custom_plugins/000.md", 'w', encoding='utf-8') as f:
+            f.write(content_md)
+
     def test_empty_string_fix(self):
         """
         测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError

From 1dc596456567d6c6822af28791ab7070f494ec65 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Tue, 25 Nov 2025 21:27:45 +0800
Subject: [PATCH 3/4] fix: remove redundant code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../extractor/html/recognizer/test_text.py | 26 -------------------
 1 file changed, 26 deletions(-)

diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_text.py b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
index 924933c4..0bd90084 100644
--- a/tests/llm_web_kit/extractor/html/recognizer/test_text.py
+++ b/tests/llm_web_kit/extractor/html/recognizer/test_text.py
@@ -528,32 +528,6 @@ def test_clean_invisible_elements(self):
         content_md = result.get_content_list().to_mm_md()
         assert "Choosing a selection results in a full page refresh." not in content_md
 
-    def test_clean_invisible_elements1(self):
-        """
-        清理隐藏标签
-        Returns:
-
-        """
-        chain = ExtractSimpleFactory.create(load_pipe_tpl('noclip_html_test'))
-        self.assertIsNotNone(chain)
-        test_data = {
-            'track_id': 'text_md',
-            'dataset_name': 'text_md',
-            'url': 'https://br.wikipedia.org/wiki/Faustina_an_Hena%C3%B1',
-            'data_source_category': 'HTML',
-            'path': '000.html',
-            'main_path': '000.html',
-            'file_bytes': 1000,
-            'meta_info': {'input_datetime': '2020-01-01 00:00:00'},
-            'language': 'en'
-        }
-        input_data = DataJson(test_data)
-        result = chain.extract(input_data)
-        content_md = result.get_content_list().to_mm_md()
-        # assert "Choosing a selection results in a full page refresh." not in content_md
-        with open("/home/PJLAB/houlinfeng/projects/custom_plugins/000.md", 'w', encoding='utf-8') as f:
-            f.write(content_md)
-
     def test_empty_string_fix(self):
         """
         测试修复字符串索引越界问题 - 当文本处理中出现空字符串时不应抛出IndexError

From 31d85b4517c81e9812c0d44485c85c13cd0e8b84 Mon Sep 17 00:00:00 2001
From: houlinfeng
Date: Thu, 27 Nov 2025 16:57:05 +0800
Subject: [PATCH 4/4] fix: update api code
---
 llm_web_kit/api/database.py         | 2 ++
 llm_web_kit/api/models/db_models.py | 3 ++-
 llm_web_kit/api/routers/htmls.py    | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/llm_web_kit/api/database.py b/llm_web_kit/api/database.py
index f58e1752..0c93a7b7 100644
--- a/llm_web_kit/api/database.py
+++ b/llm_web_kit/api/database.py
@@ -43,6 +43,8 @@ def initialize(self):
             pool_pre_ping=True,
             pool_size=settings.db_pool_size,
             max_overflow=settings.db_max_overflow,
+            pool_recycle=3600,
+            pool_use_lifo=True,
         )
 
         # 创建异步会话工厂
diff --git a/llm_web_kit/api/models/db_models.py b/llm_web_kit/api/models/db_models.py
index 2a15b6c2..871093fc 100644
--- a/llm_web_kit/api/models/db_models.py
+++ b/llm_web_kit/api/models/db_models.py
@@ -6,6 +6,7 @@
 from datetime import datetime
 
 from sqlalchemy import Column, DateTime, Integer, String, Text
+from sqlalchemy.dialects.mysql import LONGTEXT
 from sqlalchemy.ext.declarative import declarative_base
 
 Base = declarative_base()
@@ -19,7 +20,7 @@ class RequestLog(Base):
     id = Column(Integer, primary_key=True, autoincrement=True, comment='主键ID')
     request_id = Column(String(64), nullable=False, unique=True, index=True, comment='请求ID')
     input_type = Column(String(32), nullable=False, comment='输入类型: html_content, url, file')
-    input_html = Column(Text, nullable=True, comment='输入HTML字符串')
+    input_html = Column(LONGTEXT, nullable=True, comment='输入HTML字符串')
     url = Column(Text, nullable=True, comment='输入URL地址')
     output_markdown = Column(Text, nullable=True, comment='输出Markdown内容')
     status = Column(String(32), default='processing', nullable=False, comment='状态: processing-处理中, success-成功, fail-失败')
diff --git a/llm_web_kit/api/routers/htmls.py b/llm_web_kit/api/routers/htmls.py
index 0a505ba6..b329e5e8 100644
--- a/llm_web_kit/api/routers/htmls.py
+++ b/llm_web_kit/api/routers/htmls.py
@@ -138,7 +138,7 @@ async def upload_html_file(
             except Exception as commit_error:
                 logger.error(f'提交初始日志时出错: {commit_error}')
 
-        result = await html_service.parse_html(html_content=html_content)
+        result = await html_service.parse_html(html_content=html_content, url="www.baidu.com")
 
         # 更新日志为成功
         await RequestLogService.update_log_success(