ccprocessor · drunkpig · Sep 11, 2025 · Sep 10, 2025 · Sep 10, 2025 · Sep 10, 2025
diff --git a/bench/data/groundtruth/math_katex_latex_1.jsonl b/bench/data/groundtruth/math_katex_latex_1.jsonl
diff --git a/bench/data/groundtruth/math_katex_latex_3.jsonl b/bench/data/groundtruth/math_katex_latex_3.jsonl
diff --git a/llm_web_kit/extractor/html/recognizer/ccmath.py b/llm_web_kit/extractor/html/recognizer/ccmath.py
@@ -123,7 +123,6 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
             self.cm.url = base_url
             tree = cc_html
             math_render_type = math_render.get_render_type()
-            self.mathjax_detected = False  # 重置标记
 
             # process1: node循环逻辑
             for node in iter_node(tree):
@@ -136,11 +135,9 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                         node.tag == 'span' and
                         node.get('class') in [CSDN.INLINE, CSDN.DISPLAY]):
                     tag_script.process_katex_mathml(self.cm, math_render_type, node)
-                    self.mathjax_detected = True
 
                 if ZHIHU.DOMAIN in self.cm.url and node.tag == 'span' and node.get('class') == ZHIHU.MATH:
                     tag_script.process_zhihu_custom_tag(self.cm, math_render_type, node)
-                    self.mathjax_detected = True
 
                 # tag = span， class 为 math-containerm， 或者 mathjax 或者 wp-katex-eq
                 if node.tag == 'span' and node.get('class') and (
@@ -151,44 +148,32 @@ def process_ccmath_html(self, cc_html: str, o_html: str, math_render: BaseMathRe
                         'tex' in node.get('class')
                 ):
                     tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
-                    self.mathjax_detected = True
 
                 # math tags
                 if node.tag == 'math' or node.tag.endswith(':math'):
                     # print(f"匹配到数学标签: {node.tag}")
                     # print(f"标签内容: {original_html}")
                     tag_math.modify_tree(self.cm, math_render_type, original_html, node, parent)
-                    self.mathjax_detected = True
 
                 if node.tag == 'mjx-container':
                     tag_mjx.modify_tree(self.cm, math_render, original_html, node)
-                    self.mathjax_detected = True
 
                 # img中的latex
                 if node.tag == 'img':
                     tag_img.modify_tree(self.cm, math_render_type, original_html, node, parent)
-                    self.mathjax_detected = True
 
                 # span.katex
                 if node.tag == 'script' or 'math' == node.get('class') or 'katex' == node.get('class'):
                     # print('匹配到script/math/katex标签: ', original_html)
                     tag_script.modify_tree(self.cm, math_render_type, original_html, node, parent)
-                    self.mathjax_detected = True
-                # 只有有渲染器的网站才会走下面文本匹配逻辑
-                if math_render_type:
-                    # 14. 只处理只有一层的p标签
-                    if node.tag == 'p' and len(node.getchildren()) == 0:
-                        # print('匹配到p标签: ', original_html)
-                        tag_common_modify.modify_tree(self.cm, math_render_type, original_html, node, parent)
-                        self.mathjax_detected = True
 
             # procsee2: mathjax渲染器逻辑
             try:
                 # case1：有mathjax配置
                 if math_render_type == MathRenderType.MATHJAX:
                     math_render.find_math(tree)
-                # case2：无Mathjax配置但是开启Mathjax逻辑开关（node循环抽到公式的情况）
-                elif math_render_type is None and self.mathjax_detected:
+                # case2：其他情况默认开启 Mathjax配置
+                else:
                     from llm_web_kit.extractor.html.recognizer.cc_math.render.mathjax import \
                         MathJaxRenderMock
                     math_render = MathJaxRenderMock()

diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py
@@ -267,6 +267,9 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
             else:
                 math_res = self.__check_table_include_math_code(elem)
                 math_res_text = ' '.join(normalize_text_segment(item) for item in math_res)
+                # 清除math和code元素
+                if any(child.tag in [CCTag.CC_MATH_INLINE, CCTag.CC_MATH_INTERLINE, CCTag.CC_CODE, CCTag.CC_CODE_INLINE] for child in elem.iterchildren()):
+                    elem.clear()
                 elem.text = math_res_text
         else:
             math_res = self.__check_table_include_math_code(elem)

diff --git a/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_dollar.html b/tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_dollar.html
@@ -673,7 +673,7 @@ <h1 id="" itemprop="headline">If Cowboys want to lock up Tony Romo, they will ha
 
 </div>
             <div class="article-body">
-                <p>On why the Cowboys haven’t extended Tony #Romo yet#: “The problem with Tony Romo is this. His cap number is $16.8 million. The only way you can reduce that is by extending the contract, but here is the problem. Next year, if he becomes a franchise quarterback if they use the franchise tag, they will have to tender him a 1-year deal in the amount of $21.6 million. If you’re the agent for Tony Romo, you say ok we’ll do a long-term deal, but we have to use these numbers. $16.8 million this year, $20.16 million next year and then the following year it goes up another 20 percent to $24 or 25 million. You’re looking at at least $60 million over three years if they want to lock Tony Romo up. That’s the dance that’s going on between the Cowboys and Tony Romo. If Romo is willing to keep the risk of injury, if he’s willing to roll the dice and go through this year and see what happens, the Cowboys could be backed into a corner next year.” On why the Cowboys haven’t made moves in free agency: “The problem is you can’t do anything without creating cap space, and you’re not going to create cap space without cutting guys unless you extend Anthony Spencer’s deal. Now the problem is moving to the 4-3. His agent Jordan Woy is going to be saying you need to be paying him more like a 4-3 defensive not a 3-4 outside linebacker because a 4-3 defensive end makes more money.”</p><aside>
+                <p>On why the Cowboys haven’t extended Tony #Romo yet#: “The problem with Tony Romo is this. His cap number is $16.8 million. ”</p><aside>
                 <div class="aside relatedSidebar">
                     <div class="titlebar"><h1>Related</h1></div>
                     <div class="related keywordRelated">

diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html b/tests/llm_web_kit/extractor/html/recognizer/assets/ccmath/katex_mathjax_1.html
@@ -0,0 +1 @@
+\begin{vmatrix}\mathbf{i} & \mathbf{j} & \mathbf{k} \\\frac{\partial X}{\partial u} &  \frac{\partial Y}{\partial u} & 0 \\\frac{\partial X}{\partial v} &  \frac{\partial Y}{\partial v} & 0\end{vmatrix}
diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_math.py b/tests/llm_web_kit/extractor/html/recognizer/test_math.py
@@ -492,6 +492,11 @@ def test_math_recognizer_html(self):
             parts = [part[0] for part in new_parts if CCTag.CC_MATH_INTERLINE in part[0]]
             expect_text = base_dir.joinpath(test_case['expected']).read_text(encoding='utf-8').strip()
             expect_formulas = [formula for formula in expect_text.split('\n') if formula]
+            if len(parts) != len(expect_formulas):
+                print("出错样例：", test_case['input'])
+                print("期望公式数：", len(expect_formulas), "实际公式数：", len(parts))
+                print("期望公式：", expect_formulas)
+                print("实际公式：", parts)
             self.assertEqual(len(parts), len(expect_formulas))
             # answers = []
             for expect, part in zip(expect_formulas, parts):

diff --git a/tests/llm_web_kit/simple/test_simple.py b/tests/llm_web_kit/simple/test_simple.py
@@ -588,6 +588,101 @@ def test_extract_main_html_with_script(self):
         self.assertIn('B. How does the TV advertising campaign initiated by IKEA overcome the entry barrier of high advertising expenditures?', md)
         self.assertIn('Johansson, J. K. (2006). Global marketing (4th edition ed.). New York: McGraw Hill Irwin.', md)
 
+    def test_extract_main_html_with_mathjax(self):
+        """Test content extraction from main HTML that contains a MathJax-style math formula."""
+
+        main_html = r'''
+        <html><body>
+        <div class="options-div-0-0 option-box__items" style="display: none;">
+            <span class="bedroom-rate__title">Room Only Rate</span>
+            <span class="bedroom-rate__price">£1,230.00</span>
+        </div>
+        <p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
+        <p>正常内容</p>
+        </body></html>
+        '''
+
+        md = extract_content_from_main_html(self.url, main_html)
+
+        # Verify the inline formula and the surrounding paragraph text are present in the output
+        self.assertIn('$\\mathcal{F}_1$', md)
+        self.assertIn('Are the filtrations after these steps', md)
+        self.assertIn('正常内容', md)
+
+        # Verify the text inside the "display: none" div is absent from the output
+        self.assertNotIn('Room Only Rate', md)
+        self.assertNotIn('£1,230.00', md)
+
+        # Verify JavaScript code is absent (NOTE(review): main_html above has no <script>, so these two pass trivially - confirm intent)
+        self.assertNotIn('MathJax=', md)
+        self.assertNotIn('processEscapes', md)
+
+    def test_extract_main_html_with_table_with_math(self):
+        r"""Test content extraction from main HTML whose table cells contain \(...\) math formulas."""
+        main_html = r'''
+        <h2>Character values</h2>
+        <p>We give the values of \(\chi\) on generators for \(\left(\mathbb{Z}/3332\mathbb{Z}\right)^\times\).</p>
+        <table class="ntdata">
+        <tbody>
+                <tr>
+            <td class="dark border-right border-bottom">\(n\)</td>
+            <td class="light border-bottom">\(785\)</td>
+            <td class="dark border-bottom">\(885\)</td>
+            <td class="light border-bottom">\(1667\)</td>    </tr>
+            <tr>
+            <td class="dark border-right">\(\chi(n)\)</td>
+            <td class="light">\(e\left(\frac{3}{4}\right)\)</td>
+            <td class="dark">\(e\left(\frac{2}{3}\right)\)</td>
+            <td class="light">\(-1\)</td>    </tr>
+        </tbody>
+        </table>
+
+        <a name="coefficient_data"></a>
+        <h2>Coefficient data</h2>
+
+        <p>For each \(n\) we display the coefficients of the \(q\)-expansion \(a_n\), the
+        <a title="Satake parameters [cmf.satake_parameters]" knowl="cmf.satake_parameters" kwargs="">Satake parameters</a> \(\alpha_p\),
+        and the Satake angles \(\theta_p = \textrm{Arg}(\alpha_p)\).</p>
+        '''
+
+        md = extract_content_from_main_html(self.url, main_html)
+
+        # Verify the paragraph formula and both table rows appear as $...$ math in the markdown output
+        self.assertIn('We give the values of $\\chi$ on generators for $\\left(\\mathbb{Z}/3332\\mathbb{Z}\\right)^\\times$ ', md)
+        self.assertIn('| $n$ | $785$ | $885$ | $1667$ |', md)
+        self.assertIn('| $\\chi(n)$ | $e\\left(\\frac{3}{4}\\right)$ | $e\\left(\\frac{2}{3}\\right)$ | $-1$ |', md)
+
+    def test_extract_magic_html_with_mathjax(self):
+        """Test extract_content_from_html_with_magic_html on raw HTML that configures MathJax in a script tag."""
+        raw_html = r'''
+        <html>
+        <meta charset="utf-8"><meta content="IE=edge" http-equiv="X-UA-Compatible"><meta content="width=device-width,initial-scale=1,shrink-to-fit=no" name="viewport">
+        <script>MathJax={tex:{inlineMath:[["$","$"],["\\(","\\)"]],processEscapes:!0},svg:{fontCache:"global"}}</script><script async="" id="MathJax-script" src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-svg.js" type="text/javascript"></script>
+        <body>
+        <div class="options-div-0-0 option-box__items" style="display: none;">
+            <span class="bedroom-rate__title">Room Only Rate</span>
+            <span class="bedroom-rate__price">£1,230.00</span>
+        </div>
+        <p cc-select="true" class="mark-selected" data-anno-uid="anno-uid-wygjielbjln" style="">Are the filtrations after these steps. Why only these? Looking at $\mathcal{F}_1$, we can obtain probabilities for the following events:</p>
+        <p>正常内容</p>
+        </body></html>
+        '''
+
+        md = extract_content_from_html_with_magic_html(self.url, raw_html)
+
+        # Verify the inline formula and the surrounding paragraph text are present in the output
+        self.assertIn('$\\mathcal{F}_1$', md)
+        self.assertIn('Are the filtrations after these steps', md)
+        self.assertIn('正常内容', md)
+
+        # Verify the text inside the "display: none" div is absent from the output
+        self.assertNotIn('Room Only Rate', md)
+        self.assertNotIn('£1,230.00', md)
+
+        # Verify the MathJax configuration script text does not leak into the output
+        self.assertNotIn('MathJax=', md)
+        self.assertNotIn('processEscapes', md)
+
 
 if __name__ == '__main__':
     unittest.main(verbosity=2)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		\begin{vmatrix}\mathbf{i} & \mathbf{j} & \mathbf{k} \\\frac{\partial X}{\partial u} & \frac{\partial Y}{\partial u} & 0 \\\frac{\partial X}{\partial v} & \frac{\partial Y}{\partial v} & 0\end{vmatrix}