diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c0c19fe1..e4eab679 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -51,11 +51,11 @@ repos: - mdformat_frontmatter - linkify-it-py exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*' - - repo: https://github.com/myint/docformatter - rev: v1.3.1 - hooks: - - id: docformatter - args: [ "--in-place", "--wrap-descriptions", "119" ] + # - repo: https://github.com/myint/docformatter + # rev: v1.3.1 + # hooks: + # - id: docformatter + # args: [ "--in-place", "--wrap-descriptions", "119" ] - repo: local hooks: - id: clear-jupyter-notebook-output diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py index e1f500c9..ae9a8700 100644 --- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py +++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py @@ -1,3 +1,4 @@ +import re from urllib.parse import unquote from lxml.html import HtmlElement @@ -45,7 +46,10 @@ def is_display_mode(node, src_name): return True # 4. 检查图片尺寸 - if node.get('width') and int(node.get('width', '0')) > 100: + width_str = node.get('width', '') + # Take the leading integer so unit-suffixed values (e.g. "100px") parse; leading whitespace is skipped because the int() call this replaces accepted it + width_match = re.match(r'\s*(\d+)', width_str) + if width_match and int(width_match.group(1)) > 100: return True # 5. 检查是否后面紧跟
标签 diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py index 442af4a2..d5305894 100644 --- a/tests/llm_web_kit/simple/test_simple.py +++ b/tests/llm_web_kit/simple/test_simple.py @@ -718,6 +718,55 @@ def test_extract_main_html_with_table_with_math(self): self.assertIn('| $n$ | $785$ | $885$ | $1667$ |', md) self.assertIn('| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |', md) + def test_extract_main_html_with_math_img_width_various_formats(self): + """Check that img width attributes in assorted formats are handled without raising.""" + main_html = r''' +

Some text with inline formula:

+ + $E=mc^2$ +

And a larger image:

+ + large image +

Image with percent width:

+ + $a^2+b^2=c^2$ +

Image with float width:

+ + $x^n$ +

Image with float width and unit:

+ + $y^m$ +

Image with auto width:

+ + $z^k$ +

Image with em unit:

+ + $w^j$ +

Image with empty width:

+ + $v^i$ + ''' + + # This test mainly checks that no width value makes extraction raise + md = extract_content_from_main_html(self.url, main_html) + + # The surrounding text must survive extraction + self.assertIn('Some text with inline formula', md) + self.assertIn('Image with float width', md) + self.assertIn('Image with auto width', md) + + # The formulas carried by the img tags must be extracted + # width <= 100 yields inline math $...$ + self.assertIn('$E=mc^2$', md) # width="50px", 50 <= 100 + self.assertIn('$a^2+b^2=c^2$', md) # width="80%", 80 <= 100 + self.assertIn('$z^k$', md) # width="auto", no digits + self.assertIn('$w^j$', md) # width="10em", 10 <= 100 + self.assertIn('$v^i$', md) # width="", empty value + + # width > 100 yields display math $$...$$ (multi-line form) + self.assertIn('$$\nx^n\n$$', md) # width="512.123", 512 > 100 + self.assertIn('$$\ny^m\n$$', md) # width="123.456px", 123 > 100 + def test_extract_magic_html_with_mathjax(self): """测试包含MathJax数学公式的HTML内容提取.""" raw_html = r'''