diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c0c19fe1..e4eab679 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -51,11 +51,11 @@ repos:
- mdformat_frontmatter
- linkify-it-py
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
- - repo: https://github.com/myint/docformatter
- rev: v1.3.1
- hooks:
- - id: docformatter
- args: [ "--in-place", "--wrap-descriptions", "119" ]
+ # - repo: https://github.com/myint/docformatter
+ # rev: v1.3.1
+ # hooks:
+ # - id: docformatter
+ # args: [ "--in-place", "--wrap-descriptions", "119" ]
- repo: local
hooks:
- id: clear-jupyter-notebook-output
diff --git a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py
index e1f500c9..ae9a8700 100644
--- a/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py
+++ b/llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py
@@ -1,3 +1,4 @@
+import re
from urllib.parse import unquote
from lxml.html import HtmlElement
@@ -45,7 +46,10 @@ def is_display_mode(node, src_name):
return True
# 4. 检查图片尺寸
- if node.get('width') and int(node.get('width', '0')) > 100:
+ width_str = node.get('width', '')
+ # 提取数字部分,处理带单位的情况(如 "100px")
+ width_match = re.match(r'^(\d+)', width_str)
+ if width_match and int(width_match.group(1)) > 100:
return True
# 5. 检查是否后面紧跟<br>标签
diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py
index 442af4a2..d5305894 100644
--- a/tests/llm_web_kit/simple/test_simple.py
+++ b/tests/llm_web_kit/simple/test_simple.py
@@ -718,6 +718,56 @@ def test_extract_main_html_with_table_with_math(self):
self.assertIn('| $n$ | $785$ | $885$ | $1667$ |', md)
self.assertIn('| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |', md)
+ def test_extract_main_html_with_math_img_width_various_formats(self):
+ """测试img标签width属性各种格式的情况,验证不会抛出异常."""
+ main_html = r'''
+
Some text with inline formula:
+ +And a larger image:
+ +
+ Image with percent width:
+ +Image with float width:
+ +Image with float width and unit:
+ +Image with auto width:
+ +Image with em unit:
+ +Image with empty width:
+ +