Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ repos:
- mdformat_frontmatter
- linkify-it-py
exclude: '^tests/.*/assets/|llm_web_kit/model/assets/.*'
- repo: https://github.com/myint/docformatter
rev: v1.3.1
hooks:
- id: docformatter
args: [ "--in-place", "--wrap-descriptions", "119" ]
# - repo: https://github.com/myint/docformatter
# rev: v1.3.1
# hooks:
# - id: docformatter
# args: [ "--in-place", "--wrap-descriptions", "119" ]
- repo: local
hooks:
- id: clear-jupyter-notebook-output
Expand Down
6 changes: 5 additions & 1 deletion llm_web_kit/extractor/html/recognizer/cc_math/tag_img.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from urllib.parse import unquote

from lxml.html import HtmlElement
Expand Down Expand Up @@ -45,7 +46,10 @@ def is_display_mode(node, src_name):
return True

# 4. 检查图片尺寸
if node.get('width') and int(node.get('width', '0')) > 100:
width_str = node.get('width', '')
# 提取数字部分,处理带单位的情况(如 "100px")
width_match = re.match(r'^(\d+)', width_str)
if width_match and int(width_match.group(1)) > 100:
return True

# 5. 检查是否后面紧跟<br>标签
Expand Down
51 changes: 51 additions & 0 deletions tests/llm_web_kit/simple/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,56 @@ def test_extract_main_html_with_table_with_math(self):
self.assertIn('| $n$ | $785$ | $885$ | $1667$ |', md)
self.assertIn('| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |', md)

def test_extract_main_html_with_math_img_width_various_formats(self):
    """Check that <img> ``width`` values in assorted formats never raise.

    The fixture covers widths with a ``px`` unit, a percentage, a bare
    float, a float with a unit, ``auto``, an ``em`` unit and an empty
    string. Besides extraction completing without an exception, formulas
    whose width has no leading integer, or a leading integer of at most
    100, are expected as inline math (``$...$``); formulas whose leading
    integer exceeds 100 are expected as display math (``$$...$$``).
    """
    main_html = r'''
<p>Some text with inline formula:</p>
<!-- 带单位 px -->
<img src="https://latex.codecogs.com/gif.latex?E=mc^2" alt="$E=mc^2$" width="50px" height="20px" />
<p>And a larger image:</p>
<!-- 带单位 px,超过100 -->
<img src="https://example.com/large.png" alt="large image" width="200px" />
<p>Image with percent width:</p>
<!-- 百分比 -->
<img src="https://latex.codecogs.com/gif.latex?a^2+b^2=c^2" alt="$a^2+b^2=c^2$" width="80%" />
<p>Image with float width:</p>
<!-- 浮点数 -->
<img src="https://latex.codecogs.com/gif.latex?x^n" alt="$x^n$" width="512.123" />
<p>Image with float width and unit:</p>
<!-- 浮点数带单位 -->
<img src="https://latex.codecogs.com/gif.latex?y^m" alt="$y^m$" width="123.456px" />
<p>Image with auto width:</p>
<!-- auto 值 -->
<img src="https://latex.codecogs.com/gif.latex?z^k" alt="$z^k$" width="auto" />
<p>Image with em unit:</p>
<!-- em 单位 -->
<img src="https://latex.codecogs.com/gif.latex?w^j" alt="$w^j$" width="10em" />
<p>Image with empty width:</p>
<!-- 空值 -->
<img src="https://latex.codecogs.com/gif.latex?v^i" alt="$v^i$" width="" />
'''

    # The primary goal: none of the width values above may raise here.
    md = extract_content_from_main_html(self.url, main_html)

    # Each group runs under subTest so a single failing fragment does not
    # hide the results for the remaining width formats.

    # Surrounding paragraph text must survive extraction.
    expected_text = (
        'Some text with inline formula',
        'Image with float width',
        'Image with auto width',
    )
    for fragment in expected_text:
        with self.subTest(kind='text', fragment=fragment):
            self.assertIn(fragment, md)

    # Leading integer <= 100, or no leading integer at all: inline $...$.
    expected_inline = (
        '$E=mc^2$',       # width="50px": 50 <= 100
        '$a^2+b^2=c^2$',  # width="80%": 80 <= 100
        '$z^k$',          # width="auto": no digits
        '$w^j$',          # width="10em": 10 <= 100
        '$v^i$',          # width="": empty value
    )
    for fragment in expected_inline:
        with self.subTest(kind='inline', fragment=fragment):
            self.assertIn(fragment, md)

    # Leading integer > 100: display math $$...$$ in multi-line form.
    expected_display = (
        '$$\nx^n\n$$',  # width="512.123": 512 > 100
        '$$\ny^m\n$$',  # width="123.456px": 123 > 100
    )
    for fragment in expected_display:
        with self.subTest(kind='display', fragment=fragment):
            self.assertIn(fragment, md)

def test_extract_magic_html_with_mathjax(self):
"""测试包含MathJax数学公式的HTML内容提取."""
raw_html = r'''
Expand Down Expand Up @@ -752,3 +802,4 @@ def test_extract_magic_html_with_mathjax(self):

if __name__ == '__main__':
    # unittest.main() discovers and runs every test in this module and then
    # exits the interpreter by default, so no statement should follow it.
    unittest.main(verbosity=2)