Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bench/data/groundtruth/math_katex_latex_1.jsonl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion bench/data/groundtruth/math_katex_latex_3.jsonl

Large diffs are not rendered by default.

19 changes: 2 additions & 17 deletions llm_web_kit/extractor/html/recognizer/ccmath.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
self.cm.url = base_url
tree = cc_html
math_render_type = math_render.get_render_type()
self.mathjax_detected = False # 重置标记

# process1: node循环逻辑
for node in iter_node(tree):
Expand All @@ -136,11 +135,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
node.tag == 'span' and
node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]):
tag_script.process_katex_mathml(self.cm, math_render_type, node)
self.mathjax_detected = True

if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
self.mathjax_detected = True

# tag = span, class 为 math-containerm, 或者 mathjax 或者 wp-katex-eq
if node.tag == 'span' and node.get('class') and (
Expand All @@ -151,44 +148,32 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
'tex' in node.get('class')
):
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# math tags
if node.tag == 'math' or node.tag.endswith(':math'):
# print(f"匹配到数学标签: {node.tag}")
# print(f"标签内容: {original_html}")
tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

if node.tag == 'mjx-container':
tag_mjx.modify_tree(self.cm, math_render, original_html, node)
self.mathjax_detected = True

# img中的latex
if node.tag == 'img':
tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# span.katex
if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
# print('匹配到script/math/katex标签: ', original_html)
tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True
# 只有有渲染器的网站才会走下面文本匹配逻辑
if math_render_type:
# 14. 只处理只有一层的p标签
if node.tag == 'p' and len(node.getchildren()) == 0:
# print('匹配到p标签: ', original_html)
tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
self.mathjax_detected = True

# process2: mathjax渲染器逻辑
try:
# case1:有mathjax配置
if math_render_type == MathRenderType.MATHJAX:
math_render.find_math(tree)
# case2:无Mathjax配置但是开启Mathjax逻辑开关(node循环抽到公式的情况)
elif math_render_type is None and self.mathjax_detected:
# case2:其他情况默认开启 Mathjax配置
else:
from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \
MathJaxRenderMock
math_render = MathJaxRenderMock()
Expand Down
3 changes: 3 additions & 0 deletions llm_web_kit/extractor/html/recognizer/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,9 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
else:
math_res = self.__check_table_include_math_code(elem)
math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)
# 清除math和code元素
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里原来的逻辑忘记清理了,下面的clear了,不过好像下面的没有限制tag,未来还是要看下

if any(child.tag in [CCTag.CC_MATH_INLINE, CCTag.CC_MATH_INTERLINE, CCTag.CC_CODE, CCTag.CC_CODE_INLINE] for child in elem.iterchildren()):
elem.clear()
elem.text = math_res_text
else:
math_res = self.__check_table_include_math_code(elem)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ <h1 id="" itemprop="headline">If Cowboys want to lock up Tony Romo, they will ha

</div>
<div class="article-body">
<p>On why the Cowboys haven’t extended Tony #Romo yet#: “The problem with Tony Romo is this. His cap number is $16.8 million. The only way you can reduce that is by extending the contract, but here is the problem. Next year, if he becomes a franchise quarterback if they use the franchise tag, they will have to tender him a 1-year deal in the amount of $21.6 million. If you’re the agent for Tony Romo, you say ok we’ll do a long-term deal, but we have to use these numbers. $16.8 million this year, $20.16 million next year and then the following year it goes up another 20 percent to $24 or 25 million. You’re looking at at least $60 million over three years if they want to lock Tony Romo up. That’s the dance that’s going on between the Cowboys and Tony Romo. If Romo is willing to keep the risk of injury, if he’s willing to roll the dice and go through this year and see what happens, the Cowboys could be backed into a corner next year.” On why the Cowboys haven’t made moves in free agency: “The problem is you can’t do anything without creating cap space, and you’re not going to create cap space without cutting guys unless you extend Anthony Spencer’s deal. Now the problem is moving to the 4-3. His agent Jordan Woy is going to be saying you need to be paying him more like a 4-3 defensive not a 3-4 outside linebacker because a 4-3 defensive end makes more money.”</p><aside>
<p>On why the Cowboys haven’t extended Tony #Romo yet#: “The problem with Tony Romo is this. His cap number is $16.8 million. ”</p><aside>
<div class="aside relatedSidebar">
<div class="titlebar"><h1>Related</h1></div>
<div class="related keywordRelated">
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
\begin{vmatrix}\mathbf{i} & \mathbf{j} & \mathbf{k} \\\frac{\partial X}{\partial u} & \frac{\partial Y}{\partial u} & 0 \\\frac{\partial X}{\partial v} & \frac{\partial Y}{\partial v} & 0\end{vmatrix}
5 changes: 5 additions & 0 deletions tests/llm_web_kit/extractor/html/recognizer/test_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,11 @@ def test_math_recognizer_html(self):
parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip()
expect_formulas = [formula for formula in expect_text.split('\n') if formula]
if len(parts) != len(expect_formulas):
print("出错样例:", test_case['input'])
print("期望公式数:", len(expect_formulas), "实际公式数:", len(parts))
print("期望公式:", expect_formulas)
print("实际公式:", parts)
self.assertEqual(len(parts), len(expect_formulas))
# answers = []
for expect, part in zip(expect_formulas, parts):
Expand Down
95 changes: 95 additions & 0 deletions tests/llm_web_kit/simple/test_simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,101 @@ def test_extract_main_html_with_script(self):
self.assertIn('B. How does the TV advertising campaign initiated by IKEA overcome the entry barrier of high advertising expenditures?', md)
self.assertIn('Johansson, J. K. (2006). Global marketing (4th edition ed.). New York: McGraw Hill Irwin.', md)

def test_extract_main_html_with_mathjax(self):
    """Inline ``$...$`` math in main HTML must survive extraction.

    The input page holds a ``display: none`` block, one paragraph with an
    inline TeX formula, and one plain paragraph. The resulting markdown
    must keep the formula and visible text and must not contain the text
    of the hidden block.
    """
    html_snippet = r'''
<html><body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
<p>正常内容</p>
</body></html>
'''

    markdown = extract_content_from_main_html(self.url, html_snippet)

    # Must be present: the inline formula and the visible paragraph text.
    must_contain = (
        '$\\mathcal{F}_1$',
        'Are the filtrations after these steps',
        '正常内容',
    )
    for fragment in must_contain:
        self.assertIn(fragment, markdown)

    # Must be absent: text from the display:none block, followed by
    # MathJax configuration fragments (this input has no <script>, so the
    # last two only guard against such text being introduced).
    must_not_contain = (
        'Room Only Rate',
        '£1,230.00',
        'MathJax=',
        'processEscapes',
    )
    for fragment in must_not_contain:
        self.assertNotIn(fragment, markdown)

def test_extract_main_html_with_table_with_math(self):
    """Formulas inside table cells must be kept when extracting main HTML.

    The input uses ``\\(...\\)`` delimiters both in running text and in
    every ``<td>`` of a two-row table; the expected markdown renders each
    of them as ``$...$``, including inside the pipe-delimited table rows.
    """
    html_snippet = r'''
<h2>Character values</h2>
<p>We give the values of \(\chi\) on generators for \(\left(\mathbb{Z}/3332\mathbb{Z}\right)^\times\).</p>
<table class="ntdata">
<tbody>
<tr>
<td class="dark border-right border-bottom">\(n\)</td>
<td class="light border-bottom">\(785\)</td>
<td class="dark border-bottom">\(885\)</td>
<td class="light border-bottom">\(1667\)</td> </tr>
<tr>
<td class="dark border-right">\(\chi(n)\)</td>
<td class="light">\(e\left(\frac{3}{4}\right)\)</td>
<td class="dark">\(e\left(\frac{2}{3}\right)\)</td>
<td class="light">\(-1\)</td> </tr>
</tbody>
</table>

<a name="coefficient_data"></a>
<h2>Coefficient data</h2>

<p>For each \(n\) we display the coefficients of the \(q\)-expansion \(a_n\), the
<a title="Satake parameters [cmf.satake_parameters]" knowl="cmf.satake_parameters" kwargs="">Satake parameters</a> \(\alpha_p\),
and the Satake angles \(\theta_p = \textrm{Arg}(\alpha_p)\).</p>
'''

    markdown = extract_content_from_main_html(self.url, html_snippet)

    # Expected output, in order: the intro sentence with inline math,
    # then the header row and the value row of the table.
    expected_lines = (
        'We give the values of $\\chi$ on generators for $\\left(\\mathbb{Z}/3332\\mathbb{Z}\\right)^\\times$ ',
        '| $n$ | $785$ | $885$ | $1667$ |',
        '| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |',
    )
    for line in expected_lines:
        self.assertIn(line, markdown)

def test_extract_magic_html_with_mathjax(self):
    """Inline math must survive extraction from raw HTML via magic-html.

    The raw page carries an inline MathJax configuration ``<script>``, a
    ``display: none`` block, a paragraph with a ``$...$`` formula and a
    plain paragraph. The markdown must keep the formula and visible text
    and must contain neither the hidden text nor the script source.
    """
    page_source = r'''
<html>
<meta charset="utf-8"><meta content="IE=edge" http-equiv="X-UA-Compatible"><meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport">
<script>MathJax={tex:{inlineMath:[["$","$"],["\\(","\\)"]],processEscapes:!0},svg:{fontCache:"global"}}</script><script async="" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-svg.js" type="text/javascript"></script>
<body>
<div class="options-div-0-0 option-box__items" style="display: none;">
<span class="bedroom-rate__title">Room Only Rate</span>
<span class="bedroom-rate__price">£1,230.00</span>
</div>
<p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
<p>正常内容</p>
</body></html>
'''

    markdown = extract_content_from_html_with_magic_html(self.url, page_source)

    # Must be present: the inline formula and the visible paragraph text.
    must_contain = (
        '$\\mathcal{F}_1$',
        'Are the filtrations after these steps',
        '正常内容',
    )
    for fragment in must_contain:
        self.assertIn(fragment, markdown)

    # Must be absent: text from the display:none block, followed by
    # pieces of the MathJax configuration script.
    must_not_contain = (
        'Room Only Rate',
        '£1,230.00',
        'MathJax=',
        'processEscapes',
    )
    for fragment in must_not_contain:
        self.assertNotIn(fragment, markdown)


# When executed as a script, run this module's tests with verbose
# (per-test) reporting.
if __name__ == '__main__':
    unittest.main(verbosity=2)