From c6bfbf420adbfeaa95e04f10ccca3b10e92b6f4f Mon Sep 17 00:00:00 2001 From: chupei Date: Fri, 15 Aug 2025 13:41:12 +0800 Subject: [PATCH 1/8] v3.2.2-released (#524) Co-authored-by: Yanggq <1041206149@qq.com> Co-authored-by: linfeng <56671143+LollipopsAndWine@users.noreply.github.com> --- .../html/recognizer/cc_math/render/mathjax.py | 36 +++++++++++++++++ .../html/recognizer/cc_math/render/render.py | 1 + .../html/recognizer/cc_math/tag_script.py | 19 ++++++++- .../extractor/html/recognizer/ccmath.py | 40 ++++++++++++------- llm_web_kit/extractor/html/recognizer/list.py | 15 +++---- .../good_data/html/math_mathjax_mock.html | 1 + .../good_data/html_data_input.jsonl | 3 +- .../assets/ccmath/math_class_math.html | 1 + .../assets/ccmath/math_class_math_1.html | 0 .../ccmath/math_class_math_inline_1.html | 8 ++++ .../extractor/html/recognizer/test_list.py | 35 ++++++++++++++++ .../extractor/html/recognizer/test_math.py | 8 ++++ .../extractor/test_extractor_chain.py | 19 +++++++-- 13 files changed, 157 insertions(+), 29 deletions(-) create mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html create mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index bb309468..06ac62a9 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -533,6 +533,42 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool: return processascii +class MathJaxRenderMock(MathJaxRender): + """虚拟的MathJax渲染器,用于没有MathJax配置但需要使用MathJax解析逻辑的情况. + + 这个类主要用于处理以下场景: + 1. 网页中没有显式的MathJax配置(如 katex_pattern = re.compile(r'katex.render') node_text = text_strip(text) if katex_pattern.findall(node_text): @@ -28,8 +34,17 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa target_element = target_elements[0] o_html = element_to_html(target_element) target_element.text = None - new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html) + wrapped_formula = cm.wrap_math_md(formula_content) + # 转化为ccmath,例子: + # f(a,b,c) = (a^2+b^2+c^2)^3 + new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula, + target_element, math_render, o_html) + # 插入到span标签内,例子: + # target_element.insert(0, new_span) + + # 处理sript且type为math/tex的节点 + # 例子: elif node.get('type') and 'math/tex' in node.get('type'): tag_math_type_list = cm.get_equation_type(o_html) if not tag_math_type_list: diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 08021dbf..28078250 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -24,6 +24,7 @@ class MathRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() self.cm = CCMATH() + self.mathjax_detected = False # 添加检测标记 @override def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: @@ -122,8 +123,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe self.cm.url = base_url tree = cc_html math_render_type = math_render.get_render_type() - # 打印遍历node次数 - # count = 0 + self.mathjax_detected = False # 重置标记 + + # process1: node循环逻辑 for node in iter_node(tree): assert isinstance(node, HtmlElement) original_html = self._element_to_html(node) @@ -134,9 +136,11 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe node.tag == 'span' and node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]): tag_script.process_katex_mathml(self.cm, math_render_type, node) + self.mathjax_detected = True if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH: tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) + self.mathjax_detected = True # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq if node.tag == 'span' and node.get('class') and ( @@ -147,44 +151,50 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe 'tex' in node.get('class') ): tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) - - # script[type="math/tex"] - # if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'): - # print('匹配到script标签: ', node.get('type')) - # tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True # math tags if node.tag == 'math' or node.tag.endswith(':math'): # print(f"匹配到数学标签: {node.tag}") # print(f"标签内容: {original_html}") tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True if node.tag == 'mjx-container': tag_mjx.modify_tree(self.cm, math_render, original_html, node) + self.mathjax_detected = True # img中的latex if node.tag == 'img': tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True # span.katex if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'): # print('匹配到script/math/katex标签: ', original_html) tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True # 只有有渲染器的网站才会走下面文本匹配逻辑 if math_render_type: # 14. 只处理只有一层的p标签 if node.tag == 'p' and len(node.getchildren()) == 0: # print('匹配到p标签: ', original_html) tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) + self.mathjax_detected = True - # 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历 - if math_render_type: - try: - if math_render_type == MathRenderType.MATHJAX: - math_render.find_math(tree) - except Exception as e: - raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') - + # procsee2: mathjax渲染器逻辑 + try: + # case1:有mathjax配置 + if math_render_type == MathRenderType.MATHJAX: + math_render.find_math(tree) + # case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况) + elif math_render_type is None and self.mathjax_detected: + from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \ + MathJaxRenderMock + math_render = MathJaxRenderMock() + math_render.find_math(tree) + except Exception as e: + raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') # 保存处理后的html # with open('test20250702_result.html', 'w', encoding='utf-8') as f: # f.write(self._element_to_html(tree)) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index d91caa23..c3599fc8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -199,13 +199,14 @@ def __extract_list_item_text_recusive(el: HtmlElement): # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') - if child.tag in list_item_tags: - paragraph = __extract_list_item_text_recusive(child) - if len(paragraph) > 0: - tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') - new_paragraph = json.loads(tem_json) - text_paragraph.append(new_paragraph) + # list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') + # if child.tag in list_item_tags: + # 去掉if限制条件,允许非标准结构的列表通过 + paragraph = __extract_list_item_text_recusive(child) + if len(paragraph) > 0: + tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') + new_paragraph = json.loads(tem_json) + text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): tem_json = json.dumps(item).replace('$br$', '\\n\\n') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html new file mode 100644 index 00000000..2928e55a --- /dev/null +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html @@ -0,0 +1 @@ + Monotone Sequences of Real Numbers - Mathonline
Monotone Sequences of Real Numbers

Monotone Sequences of Real Numbers

We will now look at two new types of sequences, increasing sequences and decreasing sequences.

Definition: A sequence of real numbers $(a_n)$ is said to be Increasing if $a_n ≤ a_{n+1}$ for all $n \in \mathbb{N}$. Similarly, a sequence of real numbers $(a_n)$ is said to be Decreasing if $a_n ≥ a_{n+1}$ for all $n \in \mathbb{N}$. A sequence $(a_n)$ is said to be Monotone or Monotonic if it is either increasing or decreasing.

A sequence $(a_n)$ is said to be Strictly Increasing if $a_n < a_{n+1}$ for all $n \in \mathbb{N}$ and Strictly Decreasing if $a_n > a_{n+1}$ for all $n \in \mathbb{N}$.

For example, consider the sequence $\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$. We note that $\forall n \in \mathbb{N}$, $n < n+1$ and so $\frac{1}{n} > \frac{1}{n+1}$, and so this sequence is decreasing and hence monotone.

The following graph represents the first 10 terms of the monotonically decreasing sequence $\left ( \frac{1}{n} \right )$:

Screen%20Shot%202014-12-04%20at%203.58.31%20PM.png

One such example of an increasing sequence is the sequence $(n + 2)$. Clearly $\forall n \in \mathbb{N}$, $n + 2 < (n+1) + 2 = n + 3$ (since if not, then $n + 2 ≥ n + 3$ which implies that $0 ≥ 1$, which is a contradiction). The following graph represents the first 10 terms of the monotonically increasing sequence $(n + 2)$:

Screen%20Shot%202014-12-04%20at%204.02.37%20PM.png

From the definition of an increasing and decreasing sequence, we should note that EVERY successive term in the sequence should either be larger than the previous (increasing sequences) or smaller than the previous (decreasing sequences). Therefore the sequence $(1, 2, 1, \frac{1}{2}, \frac{1}{3}, \frac{1}{4}, ...)$ cannot be considered a decreasing sequence as $1 = a_1 \not ≥ a_2 = 2$. From this, we will formulate the following definitions:

Definition: A sequence of real numbers $(a_n)$ is said to be Ultimately Increasing if for some $K \in \mathbb{N}$ we have that $\forall n ≥ K$ then $a_n ≤ a_{n+1}$. Similarly, a sequence of real numbers $(a_n)$ is said to be Ultimately Decreasing if for some $K \in \mathbb{N}$ we have that $\forall n ≥ K$ then $a_n ≥ a_{n+1}$. A sequence $(a_n)$ is said to be Ultimately Monotone or Ultimately Monotonic if for some $K \in \mathbb{N}$, if $n ≥ K$ then $(a_n)$ is either ultimately increasing or ultimately decreasing.

Consider the sequence $(n^2 - 4n + 3) = (0, -1, 0, 3, 8, ...)$. This is an ultimately increasing sequence, since for $n ≥ 2$ we have that $a_n ≤ a_{n+1}$. The following graph represents the first 7 terms of this ultimately increasing sequence:

Screen%20Shot%202014-12-04%20at%204.20.53%20PM.png
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 7ccc1eb1..22cebca1 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -100,4 +100,5 @@ {"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML", "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML", "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML", "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} +{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML", "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html new file mode 100644 index 00000000..ec9e8518 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html @@ -0,0 +1 @@ + 4.7 Years In Minutes - How Many Minutes Is 4.7 Years?

Unit Converter

Conversion formula

The conversion factor from years to minutes is 525600, which means that 1 year is equal to 525600 minutes:

1 yr = 525600 min

To convert 4.7 years into minutes we have to multiply 4.7 by the conversion factor in order to get the time amount from years to minutes. We can also form a simple proportion to calculate the result:

1 yr → 525600 min

4.7 yr → T(min)

Solve the above proportion to obtain the time T in minutes:

T(min) = 4.7 yr × 525600 min

T(min) = 2470320 min

The final result is:

4.7 yr → 2470320 min

We conclude that 4.7 years is equivalent to 2470320 minutes:

4.7 years = 2470320 minutes

4.7 years is equal to 2470320 minutes

Alternative conversion

We can also convert by utilizing the inverse value of the conversion factor. In this case 1 minute is equal to 4.0480585511189E-7 × 4.7 years.

Another way is saying that 4.7 years is equal to 1 ÷ 4.0480585511189E-7 minutes.

Approximate result

For practical purposes we can round our final result to an approximate numerical value. We can say that four point seven years is approximately two million four hundred seventy thousand three hundred twenty minutes:

4.7 yr ≅ 2470320 min

An alternative is also that one minute is approximately zero times four point seven years.

Conversion table

years to minutes chart

For quick reference purposes, below is the conversion table you can use to convert from years to minutes

years (yr) minutes (min)
5.7 years 2995920 minutes
6.7 years 3521520 minutes
7.7 years 4047120 minutes
8.7 years 4572720 minutes
9.7 years 5098320 minutes
10.7 years 5623920 minutes
11.7 years 6149520 minutes
12.7 years 6675120 minutes
13.7 years 7200720 minutes
14.7 years 7726320 minutes
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html new file mode 100644 index 00000000..e69de29b diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html new file mode 100644 index 00000000..cda8dd54 --- /dev/null +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html @@ -0,0 +1,8 @@ +1 yr = 525600 min +1 yr → 525600 min +4.7 yr → T +T +T +4.7 yr → 2470320 min +4.7 years = 2470320 minutes +4.7 yr ≅ 2470320 min \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index dbe79347..5f8d61de 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -375,3 +375,38 @@ def test_get_attribute_standalone_improved(self): error_msg = str(context.exception) self.assertIn('中没有cclist标签', error_msg) self.assertIn(element.tag, error_msg) + + def test_no_standard_get_list_content_list(self): + """测试非标准结构的list获取content_list.""" + # 获取私有方法 __get_list_content_list + get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list') + + # 创建测试数据 + test_elements = [ + html_to_element('''''') + ] + + for i, element in enumerate(test_elements): + list_content_list = get_list_content_list_method(element, 1) + assert len(list_content_list) == 3 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index f51c1869..6069c590 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -207,6 +207,14 @@ 'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/', 'expected': 'assets/ccmath/math_physicsforums_2_1.html', 'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html' + }, + { + 'input': [ + 'assets/ccmath/math_class_math.html', + ], + 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', + 'expected': 'assets/ccmath/math_class_math_1.html', + 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' } ] diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index dc53e015..7f1bf8c9 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -64,7 +64,7 @@ def setUp(self): continue self.data_json.append(json.loads(line)) - assert len(self.data_json) == 103 + assert len(self.data_json) == 104 # Config for HTML extraction self.config = load_pipe_tpl('html-test') @@ -810,16 +810,27 @@ def test_ascii_delimiter(self): input_data = DataJson(test_data) result = chain.extract(input_data) md_content = result.get_content_list().to_nlp_md() - # with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f: - # f.write(md_content) self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content) self.assertIn(r'${m}^{2}$', md_content) self.assertIn(r'\rho$', md_content) self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content) self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content) + def test_mathjax_mock(self): + """测试虚拟mathjax渲染器.""" + chain = ExtractSimpleFactory.create(self.config) + self.assertIsNotNone(chain) + test_data = self.data_json[103] + input_data = DataJson(test_data) + result = chain.extract(input_data) + md_content = result.get_content_list().to_nlp_md() + self.assertIn(r'$(a_n)$', md_content) + self.assertIn(r'$a_n ≤ a_{n+1}$', md_content) + self.assertIn(r'$n \in \mathbb{N}$', md_content) + self.assertIn(r'$\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$', md_content) + def test_htmlmath_sub_sup(self): - """测试ascii分隔符.""" + """测试htmlmath中的上下标标签.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[102] From 9e03b5114841999615fdc7a279ec0843dd9d31ce Mon Sep 17 00:00:00 2001 From: chupei Date: Fri, 15 Aug 2025 13:58:05 +0800 Subject: [PATCH 2/8] Revert "v3.2.2-released" (#525) --- .../html/recognizer/cc_math/render/mathjax.py | 36 ----------------- .../html/recognizer/cc_math/render/render.py | 1 - .../html/recognizer/cc_math/tag_script.py | 19 +-------- .../extractor/html/recognizer/ccmath.py | 40 +++++++------------ llm_web_kit/extractor/html/recognizer/list.py | 15 ++++--- .../good_data/html/math_mathjax_mock.html | 1 - .../good_data/html_data_input.jsonl | 3 +- .../assets/ccmath/math_class_math.html | 1 - .../assets/ccmath/math_class_math_1.html | 0 .../ccmath/math_class_math_inline_1.html | 8 ---- .../extractor/html/recognizer/test_list.py | 35 ---------------- .../extractor/html/recognizer/test_math.py | 8 ---- .../extractor/test_extractor_chain.py | 19 ++------- 13 files changed, 29 insertions(+), 157 deletions(-) delete mode 100644 tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html delete mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html delete mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html delete mode 100644 tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py index 06ac62a9..bb309468 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py @@ -533,42 +533,6 @@ def _detect_ascii_math(self, tree: HtmlElement) -> bool: return processascii -class MathJaxRenderMock(MathJaxRender): - """虚拟的MathJax渲染器,用于没有MathJax配置但需要使用MathJax解析逻辑的情况. - - 这个类主要用于处理以下场景: - 1. 网页中没有显式的MathJax配置(如 katex_pattern = re.compile(r'katex.render') node_text = text_strip(text) if katex_pattern.findall(node_text): @@ -34,17 +28,8 @@ def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, pa target_element = target_elements[0] o_html = element_to_html(target_element) target_element.text = None - wrapped_formula = cm.wrap_math_md(formula_content) - # 转化为ccmath,例子: - # f(a,b,c) = (a^2+b^2+c^2)^3 - new_span = create_new_span([(CCMATH_INLINE, MathType.LATEX)], wrapped_formula, - target_element, math_render, o_html) - # 插入到span标签内,例子: - # + new_span = create_new_span([(CCMATH_INLINE,MathType.LATEX)], cm.wrap_math_md(formula_content), target_element, math_render, o_html) target_element.insert(0, new_span) - - # 处理sript且type为math/tex的节点 - # 例子: elif node.get('type') and 'math/tex' in node.get('type'): tag_math_type_list = cm.get_equation_type(o_html) if not tag_math_type_list: diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py index 28078250..08021dbf 100644 --- a/llm_web_kit/extractor/html/recognizer/ccmath.py +++ b/llm_web_kit/extractor/html/recognizer/ccmath.py @@ -24,7 +24,6 @@ class MathRecognizer(BaseHTMLElementRecognizer): def __init__(self): super().__init__() self.cm = CCMATH() - self.mathjax_detected = False # 添加检测标记 @override def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]: @@ -123,9 +122,8 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe self.cm.url = base_url tree = cc_html math_render_type = math_render.get_render_type() - self.mathjax_detected = False # 重置标记 - - # process1: node循环逻辑 + # 打印遍历node次数 + # count = 0 for node in iter_node(tree): assert isinstance(node, HtmlElement) original_html = self._element_to_html(node) @@ -136,11 +134,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe node.tag == 'span' and node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]): tag_script.process_katex_mathml(self.cm, math_render_type, node) - self.mathjax_detected = True if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH: tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node) - self.mathjax_detected = True # tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq if node.tag == 'span' and node.get('class') and ( @@ -151,50 +147,44 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe 'tex' in node.get('class') ): tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) - self.mathjax_detected = True + + # script[type="math/tex"] + # if node.tag == 'script' and node.get('type') and 'math/tex' in node.get('type'): + # print('匹配到script标签: ', node.get('type')) + # tag_common_modify.modify_tree(cm, math_render_type, original_html, node, parent) # math tags if node.tag == 'math' or node.tag.endswith(':math'): # print(f"匹配到数学标签: {node.tag}") # print(f"标签内容: {original_html}") tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent) - self.mathjax_detected = True if node.tag == 'mjx-container': tag_mjx.modify_tree(self.cm, math_render, original_html, node) - self.mathjax_detected = True # img中的latex if node.tag == 'img': tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent) - self.mathjax_detected = True # span.katex if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'): # print('匹配到script/math/katex标签: ', original_html) tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent) - self.mathjax_detected = True # 只有有渲染器的网站才会走下面文本匹配逻辑 if math_render_type: # 14. 只处理只有一层的p标签 if node.tag == 'p' and len(node.getchildren()) == 0: # print('匹配到p标签: ', original_html) tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent) - self.mathjax_detected = True - # procsee2: mathjax渲染器逻辑 - try: - # case1:有mathjax配置 - if math_render_type == MathRenderType.MATHJAX: - math_render.find_math(tree) - # case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况) - elif math_render_type is None and self.mathjax_detected: - from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \ - MathJaxRenderMock - math_render = MathJaxRenderMock() - math_render.find_math(tree) - except Exception as e: - raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') + # 修改:传入tree节点,mathjax方案作为process2,不参与上面process1节点的遍历 + if math_render_type: + try: + if math_render_type == MathRenderType.MATHJAX: + math_render.find_math(tree) + except Exception as e: + raise HtmlMathMathjaxRenderRecognizerException(f'处理MathjaxRender数学公式失败: {e}') + # 保存处理后的html # with open('test20250702_result.html', 'w', encoding='utf-8') as f: # f.write(self._element_to_html(tree)) diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index c3599fc8..d91caa23 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -199,14 +199,13 @@ def __extract_list_item_text_recusive(el: HtmlElement): # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - # list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') - # if child.tag in list_item_tags: - # 去掉if限制条件,允许非标准结构的列表通过 - paragraph = __extract_list_item_text_recusive(child) - if len(paragraph) > 0: - tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') - new_paragraph = json.loads(tem_json) - text_paragraph.append(new_paragraph) + list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span') + if child.tag in list_item_tags: + paragraph = __extract_list_item_text_recusive(child) + if len(paragraph) > 0: + tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}') + new_paragraph = json.loads(tem_json) + text_paragraph.append(new_paragraph) for n, item in enumerate(text_paragraph): tem_json = json.dumps(item).replace('$br$', '\\n\\n') diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html deleted file mode 100644 index 2928e55a..00000000 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_mathjax_mock.html +++ /dev/null @@ -1 +0,0 @@ - Monotone Sequences of Real Numbers - Mathonline
Monotone Sequences of Real Numbers

Monotone Sequences of Real Numbers

We will now look at two new types of sequences, increasing sequences and decreasing sequences.

Definition: A sequence of real numbers $(a_n)$ is said to be Increasing if $a_n ≤ a_{n+1}$ for all $n \in \mathbb{N}$. Similarly, a sequence of real numbers $(a_n)$ is said to be Decreasing if $a_n ≥ a_{n+1}$ for all $n \in \mathbb{N}$. A sequence $(a_n)$ is said to be Monotone or Monotonic if it is either increasing or decreasing.

A sequence $(a_n)$ is said to be Strictly Increasing if $a_n < a_{n+1}$ for all $n \in \mathbb{N}$ and Strictly Decreasing if $a_n > a_{n+1}$ for all $n \in \mathbb{N}$.

For example, consider the sequence $\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$. We note that $\forall n \in \mathbb{N}$, $n < n+1$ and so $\frac{1}{n} > \frac{1}{n+1}$, and so this sequence is decreasing and hence monotone.

The following graph represents the first 10 terms of the monotonically decreasing sequence $\left ( \frac{1}{n} \right )$:

Screen%20Shot%202014-12-04%20at%203.58.31%20PM.png

One such example of an increasing sequence is the sequence $(n + 2)$. Clearly $\forall n \in \mathbb{N}$, $n + 2 < (n+1) + 2 = n + 3$ (since if not, then $n + 2 ≥ n + 3$ which implies that $0 ≥ 1$, which is a contradiction). The following graph represents the first 10 terms of the monotonically increasing sequence $(n + 2)$:

Screen%20Shot%202014-12-04%20at%204.02.37%20PM.png

From the definition of an increasing and decreasing sequence, we should note that EVERY successive term in the sequence should either be larger than the previous (increasing sequences) or smaller than the previous (decreasing sequences). Therefore the sequence $(1, 2, 1, \frac{1}{2}, \frac{1}{3}, \frac{1}{4}, ...)$ cannot be considered a decreasing sequence as $1 = a_1 \not ≥ a_2 = 2$. From this, we will formulate the following definitions:

Definition: A sequence of real numbers $(a_n)$ is said to be Ultimately Increasing if for some $K \in \mathbb{N}$ we have that $\forall n ≥ K$ then $a_n ≤ a_{n+1}$. Similarly, a sequence of real numbers $(a_n)$ is said to be Ultimately Decreasing if for some $K \in \mathbb{N}$ we have that $\forall n ≥ K$ then $a_n ≥ a_{n+1}$. A sequence $(a_n)$ is said to be Ultimately Monotone or Ultimately Monotonic if for some $K \in \mathbb{N}$, if $n ≥ K$ then $(a_n)$ is either ultimately increasing or ultimately decreasing.

Consider the sequence $(n^2 - 4n + 3) = (0, -1, 0, 3, 8, ...)$. This is an ultimately increasing sequence, since for $n ≥ 2$ we have that $a_n ≤ a_{n+1}$. The following graph represents the first 7 terms of this ultimately increasing sequence:

Screen%20Shot%202014-12-04%20at%204.20.53%20PM.png
Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License
\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl index 22cebca1..7ccc1eb1 100644 --- a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl +++ b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl @@ -100,5 +100,4 @@ {"track_id": "test_mjx_container", "dataset_name": "test_mjx_container", "url": "https://test.com","data_source_category": "HTML", "path":"testmathjax.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_word_press", "dataset_name": "test_word_press", "url": "https://test.com","data_source_category": "HTML", "path":"word_press.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} {"track_id": "test_ascii_delimiter", "dataset_name": "test_ascii_delimiter", "url": "https://montalk.net/notes/342/tuning-forks-and-megalithic-technology","data_source_category": "HTML", "path":"math_test_ascii_delimiter.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} -{"track_id": "test_mathjax_mock", "dataset_name": "test_mathjax_mock", "url": "http://mathonline.wikidot.com/monotone-sequences-of-real-numbers","data_source_category": "HTML", "path":"math_mathjax_mock.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file +{"track_id": "test_htmlmath_sub_sup", "dataset_name": "test_htmlmath_sub_sup", "url": "https://cccbdb.nist.gov/compvibs3.asp?casno=123911&charge=0&method=42&basis=0","data_source_category": "HTML", "path":"math_table_title_htmlmath_sub_sup.html", "file_bytes": 1000, "page_layout_type":"artical", "meta_info": {"input_datetime": "2020-01-01 00:00:00"}} \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html deleted file mode 100644 index ec9e8518..00000000 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math.html +++ /dev/null @@ -1 +0,0 @@ - 4.7 Years In Minutes - How Many Minutes Is 4.7 Years?

Unit Converter

Conversion formula

The conversion factor from years to minutes is 525600, which means that 1 year is equal to 525600 minutes:

1 yr = 525600 min

To convert 4.7 years into minutes we have to multiply 4.7 by the conversion factor in order to get the time amount from years to minutes. We can also form a simple proportion to calculate the result:

1 yr → 525600 min

4.7 yr → T(min)

Solve the above proportion to obtain the time T in minutes:

T(min) = 4.7 yr × 525600 min

T(min) = 2470320 min

The final result is:

4.7 yr → 2470320 min

We conclude that 4.7 years is equivalent to 2470320 minutes:

4.7 years = 2470320 minutes

4.7 years is equal to 2470320 minutes

Alternative conversion

We can also convert by utilizing the inverse value of the conversion factor. In this case 1 minute is equal to 4.0480585511189E-7 × 4.7 years.

Another way is saying that 4.7 years is equal to 1 ÷ 4.0480585511189E-7 minutes.

Approximate result

For practical purposes we can round our final result to an approximate numerical value. We can say that four point seven years is approximately two million four hundred seventy thousand three hundred twenty minutes:

4.7 yr ≅ 2470320 min

An alternative is also that one minute is approximately zero times four point seven years.

Conversion table

years to minutes chart

For quick reference purposes, below is the conversion table you can use to convert from years to minutes

years (yr) minutes (min)
5.7 years 2995920 minutes
6.7 years 3521520 minutes
7.7 years 4047120 minutes
8.7 years 4572720 minutes
9.7 years 5098320 minutes
10.7 years 5623920 minutes
11.7 years 6149520 minutes
12.7 years 6675120 minutes
13.7 years 7200720 minutes
14.7 years 7726320 minutes

©2020 ConvertOctopus.com

\ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_1.html deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html deleted file mode 100644 index cda8dd54..00000000 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/math_class_math_inline_1.html +++ /dev/null @@ -1,8 +0,0 @@ -1 yr = 525600 min -1 yr → 525600 min -4.7 yr → T -T -T -4.7 yr → 2470320 min -4.7 years = 2470320 minutes -4.7 yr ≅ 2470320 min \ No newline at end of file diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_list.py b/tests/llm_web_kit/extractor/html/recognizer/test_list.py index 5f8d61de..dbe79347 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_list.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_list.py @@ -375,38 +375,3 @@ def test_get_attribute_standalone_improved(self): error_msg = str(context.exception) self.assertIn('中没有cclist标签', error_msg) self.assertIn(element.tag, error_msg) - - def test_no_standard_get_list_content_list(self): - """测试非标准结构的list获取content_list.""" - # 获取私有方法 __get_list_content_list - get_list_content_list_method = getattr(self.__list_recognize, '_ListRecognizer__get_list_content_list') - - # 创建测试数据 - test_elements = [ - html_to_element('''''') - ] - - for i, element in enumerate(test_elements): - list_content_list = get_list_content_list_method(element, 1) - assert len(list_content_list) == 3 diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py index 6069c590..f51c1869 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_math.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_math.py @@ -207,14 +207,6 @@ 'base_url': 'https://physicshelpforum.com/t/latex-upgrade-physics-forum-powered-by-mathjax-v3.17489/', 'expected': 'assets/ccmath/math_physicsforums_2_1.html', 'expected_inline': 'assets/ccmath/math_physicsforums_2_inline_1.html' - }, - { - 'input': [ - 'assets/ccmath/math_class_math.html', - ], - 'base_url': 'https://convertoctopus.com/4-7-years-to-minutes', - 'expected': 'assets/ccmath/math_class_math_1.html', - 'expected_inline': 'assets/ccmath/math_class_math_inline_1.html' } ] diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 7f1bf8c9..dc53e015 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -64,7 +64,7 @@ def setUp(self): continue self.data_json.append(json.loads(line)) - assert len(self.data_json) == 104 + assert len(self.data_json) == 103 # Config for HTML extraction self.config = load_pipe_tpl('html-test') @@ -810,27 +810,16 @@ def test_ascii_delimiter(self): input_data = DataJson(test_data) result = chain.extract(input_data) md_content = result.get_content_list().to_nlp_md() + # with open('mathjax抽取case222.md', 'w', encoding='utf-8') as f: + # f.write(md_content) self.assertIn(r'$f = \frac{1}{T} ^ 2 \sqrt{\frac{A E}{\rho}}$', md_content) self.assertIn(r'${m}^{2}$', md_content) self.assertIn(r'\rho$', md_content) self.assertIn(r'$f = \frac{1}{2 L} \sqrt{\frac{E}{\rho}}$', md_content) self.assertIn(r'$L = {T}^{2} / \left(2 W\right)$', md_content) - def test_mathjax_mock(self): - """测试虚拟mathjax渲染器.""" - chain = ExtractSimpleFactory.create(self.config) - self.assertIsNotNone(chain) - test_data = self.data_json[103] - input_data = DataJson(test_data) - result = chain.extract(input_data) - md_content = result.get_content_list().to_nlp_md() - self.assertIn(r'$(a_n)$', md_content) - self.assertIn(r'$a_n ≤ a_{n+1}$', md_content) - self.assertIn(r'$n \in \mathbb{N}$', md_content) - self.assertIn(r'$\left ( \frac{1}{n} \right ) = (1, \frac{1}{2}, \frac{1}{3}, ..., \frac{1}{n}, \frac{1}{n+1}, ... )$', md_content) - def test_htmlmath_sub_sup(self): - """测试htmlmath中的上下标标签.""" + """测试ascii分隔符.""" chain = ExtractSimpleFactory.create(self.config) self.assertIsNotNone(chain) test_data = self.data_json[102] From 329439b37c390fcae4cdee37132657c17e3603c6 Mon Sep 17 00:00:00 2001 From: quyuan Date: Fri, 15 Aug 2025 06:13:25 +0000 Subject: [PATCH 3/8] Update version.py with new version --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index b50da94d..29e4a941 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '3.2.1' +__version__ = '3.2.2' From fe1a2ec1e0df4cbf85d4e1d17e1224858231e3a8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Fri, 22 Aug 2025 12:32:52 +0000 Subject: [PATCH 4/8] Update version.py with new version --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index 29e4a941..32206102 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '3.2.2' +__version__ = '3.2.3' From 6668e692340a8b1d150982f093069aecc81a01e8 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 25 Aug 2025 03:37:07 +0000 Subject: [PATCH 5/8] Update version.py with new version --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index 32206102..d6497a81 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '3.2.3' +__version__ = '4.0.0' From 85a91edaa71a40262002cb351b4d56bac5476d28 Mon Sep 17 00:00:00 2001 From: quyuan Date: Mon, 25 Aug 2025 11:52:47 +0000 Subject: [PATCH 6/8] Update version.py with new version --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index d6497a81..1a3bef53 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '4.0.0' +__version__ = '4.0.1' From 21b3622ab004951ff8b06a77580b7e909c00e394 Mon Sep 17 00:00:00 2001 From: quyuan Date: Thu, 11 Sep 2025 11:48:59 +0000 Subject: [PATCH 7/8] Update version.py with new version --- llm_web_kit/libs/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/libs/version.py b/llm_web_kit/libs/version.py index 1a3bef53..fa721b49 100644 --- a/llm_web_kit/libs/version.py +++ b/llm_web_kit/libs/version.py @@ -1 +1 @@ -__version__ = '4.0.1' +__version__ = '4.1.0' From 2b99160d9710b5f628525cc9f8b7e95a1f100833 Mon Sep 17 00:00:00 2001 From: chupei Date: Thu, 11 Sep 2025 20:01:32 +0800 Subject: [PATCH 8/8] update pydantic requirement --- requirements/dev.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements/dev.txt b/requirements/dev.txt index ffe7e23f..23380654 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -4,7 +4,6 @@ nbstripout==0.8.1 nltk==3.8.1 openai==1.75.0 pre-commit==3.8.0 -pydantic==2.10.6 pytest==8.3.3 # coverage tools pytest-cov==6.0.0