diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index c3599fc8..61f113b8 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -1,6 +1,7 @@ import json from typing import Any, List, Tuple +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,10 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags @@ -224,7 +227,9 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis Returns: list: 包含列表项内容的列表,即items """ - + ele_html = lxml_html.tostring(ele, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(ele_html) + ele = html_to_element(replace_tree_html) content_list = [] # 处理根元素文本 if ele.text and ele.text.strip(): @@ -239,6 +244,8 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis for child in ele.iterchildren(): text_paragraph = self.__extract_list_item_text(child) if len(text_paragraph) > 0: + json_paragraph = restore_sub_sup_from_text_regex(json.dumps(text_paragraph)) + text_paragraph = json.loads(json_paragraph) content_list.extend(text_paragraph) return content_list diff --git a/llm_web_kit/extractor/html/recognizer/table.py b/llm_web_kit/extractor/html/recognizer/table.py index e4cc78ce..0d6e2b8e 100644 --- a/llm_web_kit/extractor/html/recognizer/table.py +++ b/llm_web_kit/extractor/html/recognizer/table.py @@ -1,6 +1,9 @@ +import 
re from typing import Any, List, Tuple +from lxml import html from lxml.html import HtmlElement +from lxml.html.clean import Cleaner from overrides import override from llm_web_kit.exception.exception import HtmlTableRecognizerException @@ -8,12 +11,34 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (element_to_html_unescaped, + html_normalize_space, html_to_element, + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from llm_web_kit.libs.text_utils import normalize_text_segment from .text import inline_tags +new_inline_tags = inline_tags.union({'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption'}) + +allow_tags = ['table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot', 'caption', 'sub', 'sup', 'ccmath-inline', 'ccmath-interline', 'cccode', 'cccode-inline'] + +cleaner = Cleaner( + safe_attrs_only=False, + page_structure=False, + style=True, + scripts=True, + comments=True, + links=False, + meta=True, + embedded=True, + frames=True, + forms=True, + annoying_tags=True, + allow_tags=allow_tags +) + # 空元素 VOID_ELEMENTS = { 'area', 'base', 'br', 'col', 'embed', 'hr', @@ -234,7 +259,7 @@ def process_node(node): if node.tail and node.tail.strip(): result.append(node.tail.strip()) else: - if node.tag == 'br' or node.tag not in inline_tags: + if node.tag == 'br' or node.tag not in new_inline_tags: result.append('\n\n') # 提取当前节点的文本 @@ -247,7 +272,7 @@ def process_node(node): process_node(child) # 处理节点的tail(元素闭合后的文本) if node.tail and node.tail.strip(): - if node.tag not in inline_tags: + if node.tag not in new_inline_tags: result.append('\n\n') cleaned_tail = node.tail.strip() result.append(html_normalize_space(cleaned_tail)) @@ -274,7 +299,10 @@ def __simplify_td_th_content(self, 
table_nest_level, elem: HtmlElement) -> None: else: math_res = self.__check_table_include_math_code(elem) elem.clear() - math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + if elem.tag not in new_inline_tags: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) + "\n\n" + else: + math_res_text = ' '.join(normalize_text_segment(item) for item in math_res) if elem.tag in VOID_ELEMENTS: elem_pre = elem.getprevious() if elem_pre is not None: @@ -292,22 +320,28 @@ def __get_table_body(self, table_type, table_nest_level, table_root): if table_type == 'empty': content = table_root.text_content() return content + table_html = html.tostring(table_root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(table_html) + table_root = html_to_element(replace_tree_html) + # 清理除了colspan和rowspan之外的属性 self.__simplify_td_th_content(table_nest_level, table_root) table_clean_attributes(table_root) + clean_html = cleaner.clean_html(self._element_to_html_entity(table_root)) + new_table_root = self._build_html_tree(clean_html) - # doc = html.fromstring(html_content) - for element in table_root.iter(): + pattern = re.compile(r'(\s*\n\s*\n\s*|\n{2,})') + for element in new_table_root.iter(): # 清理元素前后的空白(不影响.text和.tail的内容) if element.text is not None: - element.text = element.text.lstrip('\n\t ') + element.text = re.sub(pattern, '\n\n', element.text.strip()) if element.tail is not None: - if "\n\n" in element.tail: - element.tail = "\n\n" + element.tail.lstrip('\n\t ') - else: - element.tail = element.tail.lstrip('\n\t ') + element.tail = re.sub(pattern, '\n\n', element.tail.lstrip()).rstrip() + + tree_html = element_to_html_unescaped(new_table_root) + restore_tree_html = restore_sub_sup_from_text_regex(tree_html) - return self._element_to_html_entity(table_root) + return restore_tree_html def __do_extract_tables(self, root: HtmlElement) -> None: """递归处理所有子标签.""" diff --git 
a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py index 0052872c..6dc7e346 100644 --- a/llm_web_kit/extractor/html/recognizer/text.py +++ b/llm_web_kit/extractor/html/recognizer/text.py @@ -13,7 +13,9 @@ from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType from llm_web_kit.libs.html_utils import (element_to_html_unescaped, html_normalize_space, html_to_element, - process_sub_sup_tags) + process_sub_sup_tags, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) special_symbols = [ # TODO 从文件读取 '®', # 注册商标符号 @@ -65,7 +67,7 @@ 'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup', 'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline', 'marked-tail', 'marked-text', 'math','mspace', 'font', 'nobr', 'bdi', - 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins' + 'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins', 'xhtml' } # 词间无分隔符的语言 @@ -205,7 +207,10 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str: if (text2 and text2[0] in string.punctuation) or (text2 and text2[0] in special_symbols) or (text2 and text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols): words_sep = '' else: - words_sep = ' ' + if text2.startswith('tem_sub_') or text2.startswith('tem_sup_') or text1.endswith("tem_sub_start") or text1.endswith("tem_sup_start"): + words_sep = '' + else: + words_sep = ' ' txt = text1 + words_sep + text2 return self.replace_entities(txt.strip(), entities_map) @@ -222,12 +227,13 @@ def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[d Args: el: 代表一个段落的html元素 """ + _html = html.tostring(root, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + root = html_to_element(replace_tree_html) + para_text = [] def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: - # 标记当前元素是否是sub或sup类型 - is_sub_sup = el.tag == 'sub' or el.tag == 'sup' 
- if el.tag == CCTag.CC_MATH_INLINE: if text: para_text.append({'c': text, 't': ParagraphTextType.TEXT}) @@ -254,19 +260,17 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str: # 处理尾部文本 if el.tail and el.tail.strip(): - if is_sub_sup: - _new_tail = html_normalize_space(el.tail.strip()) - text += _new_tail - else: - _new_tail = html_normalize_space(el.tail.strip()) - new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail - text = self.__combine_text(text, new_tail, language) + _new_tail = html_normalize_space(el.tail.strip()) + new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail + text = self.__combine_text(text, new_tail, language) return text if final := __get_paragraph_text_recusive(root, ''): para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT}) + for item in para_text: + item['c'] = restore_sub_sup_from_text_regex(item['c']) return para_text def __extract_paragraphs(self, root: HtmlElement): diff --git a/llm_web_kit/extractor/html/recognizer/title.py b/llm_web_kit/extractor/html/recognizer/title.py index cf105b9f..d3205344 100644 --- a/llm_web_kit/extractor/html/recognizer/title.py +++ b/llm_web_kit/extractor/html/recognizer/title.py @@ -1,6 +1,7 @@ from typing import List, Tuple # from lxml.etree import _Element as HtmlElement +from lxml import html as lxml_html from lxml.html import HtmlElement from overrides import override @@ -8,8 +9,9 @@ from llm_web_kit.extractor.html.recognizer.recognizer import ( BaseHTMLElementRecognizer, CCTag) from llm_web_kit.libs.doc_element_type import DocElementType -from llm_web_kit.libs.html_utils import (html_normalize_space, - process_sub_sup_tags) +from llm_web_kit.libs.html_utils import (html_normalize_space, html_to_element, + replace_sub_sup_with_text_regex, + restore_sub_sup_from_text_regex) from .text import PARAGRAPH_SEPARATOR @@ -90,10 
+92,14 @@ def __do_extract_title(self, root:HtmlElement) -> None: """ # 匹配需要替换的标签 if root.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: + if root.tail and root.tail.strip(): + tail_text = root.tail.strip() + else: + tail_text = '' + root.tail = None title_text = self.__extract_title_text(root) title_raw_html = self._element_to_html(root) title_level = str(self.__extract_title_level(root.tag)) - tail_text = root.tail cc_element = self._build_cc_element(CCTag.CC_TITLE, title_text, tail_text, level=title_level, html=title_raw_html) self._replace_element(root, cc_element) return @@ -122,8 +128,9 @@ def __extract_title_text(self, header_el:HtmlElement) -> str: Returns: str: 标题的文本 """ + blks = [] + def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> list[str]: - blks = [] if el.tag == CCTag.CC_CODE_INLINE: blks.append(f'`{el.text}`') @@ -134,21 +141,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li _new_text = html_normalize_space(el.text.strip()) blks.append(_new_text) - for child in el.getchildren(): - if child.tag == 'sub' or child.tag == 'sup': - blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail]) - else: - blks.extend(__extract_title_text_recusive(child)) - if with_tail: blks.append((el.tail or '').strip()) return blks - # 根元素不保留结尾 - blks = __extract_title_text_recusive(header_el, False) + _html = lxml_html.tostring(header_el, encoding='utf-8').decode() + replace_tree_html = replace_sub_sup_with_text_regex(_html) + header_el = html_to_element(replace_tree_html) - return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR) + for child in header_el.iter(): + __extract_title_text_recusive(child, True) + return restore_sub_sup_from_text_regex(' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)) def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]: """获取element的属性.""" diff --git a/llm_web_kit/libs/html_utils.py 
b/llm_web_kit/libs/html_utils.py index fede6cf0..05d6783b 100644 --- a/llm_web_kit/libs/html_utils.py +++ b/llm_web_kit/libs/html_utils.py @@ -452,6 +452,37 @@ def html_normalize_space(text: str) -> str: return text +def replace_sub_sup_with_text_regex(html_content): + """使用正则表达式将 HTML 中的 <sub>、<sup> 标签替换为特殊标记。""" + + def replacer(match): + tag = match.group(0).lower() + if tag.startswith('<sub'): + return 'tem_sub_start' + if tag == '</sub>': + return 'tem_sub_end' + if tag.startswith('<sup'): + return 'tem_sup_start' + if tag == '</sup>': + return 'tem_sup_end' + + pattern = r'</?(sub|sup)[^>]*>' + return re.sub(pattern, replacer, html_content, flags=re.IGNORECASE) + + +def restore_sub_sup_from_text_regex(processed_content): + """将<sub>、<sup>的替换标记还原为原始的 HTML 标签。""" + replacement_map = { + 'tem_sub_start': '<sub>', + 'tem_sub_end': '</sub>', + 'tem_sup_start': '<sup>', + 'tem_sup_end': '</sup>' + } + + pattern = '|'.join(re.escape(key) for key in replacement_map.keys()) + return re.sub(pattern, lambda m: replacement_map[m.group(0)], processed_content) + + + def get_plain_text_fast(html_source: str) -> str: """使用lxml快速获取html中的纯文本. diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md index c8745567..bd8a9517 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.md @@ -4,7 +4,7 @@ ### Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. 
### Limit Use of `assert` Statements diff --git a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt index 7b4f582d..2bdc3018 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt +++ b/tests/llm_web_kit/extractor/html/recognizer/assets/cccode/mathworks.txt @@ -1,7 +1,7 @@ 主要内容 Single-Precision Conversion Best Practices Use Integers for Index Variables -In MATLAB®code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. +In MATLAB® code that you want to convert to single precision, it is a best practice to use integers for index variables. However, if the code does not use integers for index variables, when possible `convertToSingle` tries to detect the index variables and select `int32` types for them. Limit Use of `assert` Statements - Do not use `assert` statements to define the properties of input arguments. - Do not use `assert` statements to test the type of a variable. For example, do not use diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_table.py b/tests/llm_web_kit/extractor/html/recognizer/test_table.py index fb3ba4ff..c80d3568 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_table.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_table.py @@ -66,7 +66,7 @@ def test_only_involve_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) self.assertEqual(len(parts), 2) table_body = parts[1][0].text_content() - assert table_body == '
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' + assert table_body == '
Mrs S Hindle
ShowCCRCC
Driffield 5th October 2006CH. Ricksbury Royal HeroCH. Keyingham Branwell
Manchester 16th January 2008CH. Lochbuie GeordieMerryoth Maeve
Darlington 20th September 2009CH. Maibee Make BelieveCH. Loranka Just Like Heaven JW
Blackpool 22nd June 2012CH. Loranka Sherrie BabyDear Magic Touch De La Fi Au Songeur
Welsh Kennel Club 2014Brymarden Carolina SunriseCh. Wandris Evan Elp Us
Welsh Kennel Club 2014Ch. Charnell Clematis of SalegreenCH. Byermoor Queens Maid
' def test_table_include_img_label(self): """table是否包含img标签.""" @@ -89,7 +89,7 @@ def test_cc_simple_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '
Рейтинг:
Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' + assert content == '
Рейтинг:Рейтинг 5.00 из 5 на основе опроса 3 пользователей
Тип товара:Препараты для омоложения
Форма:Крем
Объем:50 мл
Рецепт:Отпускается без рецепта
Способ хранения:Хранить при температуре 4-20°
Примечание:Беречь от детей
Оплата:Наличными/банковской картой
Доступность в Северске:В наличии
Доставка:2-7 Дней
Цена:84 ₽
' def test_cc_complex_table(self): """cc跨行跨列的表格.""" @@ -100,7 +100,7 @@ def test_cc_complex_table(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[1][0].text_content() - assert content == '\n\n\n\n
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' + assert content == '
ফেব্রুয়ারি ২০২৪
সোমমঙ্গলবুধবৃহশুক্রশনিরবি
« জানুয়ারি
১০১১
১২১৩১৪১৫১৬১৭১৮
১৯২০২১২২২৩২৪২৫
২৬২৭২৮২৯
' table_type = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}')[0] assert table_type.attrib['table_type'] == 'complex' @@ -155,10 +155,10 @@ def test_table_involve_equation(self): raw_html = raw_html_path.read_text(encoding='utf-8') parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
' parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) complex_table_tag = parts[1][0].xpath(f'.//{CCTag.CC_TABLE}') - assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$ \n\n${\\displaystyle np}$ \n\n${\\displaystyle np(1-p)}$ \n\n
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$ \n\n${\\displaystyle {\\frac {1}{p}}}$ \n\n${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$ \n\n
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$ \n\n${\\displaystyle \\mu }$ \n\n${\\displaystyle \\sigma ^{2}}$ \n\n
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$ \n\n${\\displaystyle {\\frac {a+b}{2}}}$ \n\n${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$ \n\n
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda }}}$ \n\n${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$ \n\n
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$ \n\n${\\displaystyle \\lambda }$ \n\n${\\displaystyle \\lambda }$ \n\n
' + assert complex_table_tag[0].text == '
Name of the probability distributionProbability distribution functionMeanVariance
Binomial distribution${\\displaystyle \\Pr \\,(X=k)={\\binom {n}{k}}p^{k}(1-p)^{n-k}}$${\\displaystyle np}$${\\displaystyle np(1-p)}$
Geometric distribution${\\displaystyle \\Pr \\,(X=k)=(1-p)^{k-1}p}$${\\displaystyle {\\frac {1}{p}}}$${\\displaystyle {\\frac {(1-p)}{p^{2}}}}$
Normal distribution${\\displaystyle f\\left(x\\mid \\mu ,\\sigma ^{2}\\right)={\\frac {1}{\\sqrt {2\\pi \\sigma ^{2}}}}e^{-{\\frac {(x-\\mu )^{2}}{2\\sigma ^{2}}}}}$${\\displaystyle \\mu }$${\\displaystyle \\sigma ^{2}}$
Uniform distribution (continuous)${\\displaystyle f(x\\mid a,b)={\\begin{cases}{\\frac {1}{b-a}}&{\\text{for }}a\\leq x\\leq b,\\\\[3pt]0&{\\text{for }}xb\\end{cases}}}$${\\displaystyle {\\frac {a+b}{2}}}$${\\displaystyle {\\frac {(b-a)^{2}}{12}}}$
Exponential distribution${\\displaystyle f(x\\mid \\lambda )=\\lambda e^{-\\lambda x}}$${\\displaystyle {\\frac {1}{\\lambda }}}$${\\displaystyle {\\frac {1}{\\lambda ^{2}}}}$
Poisson distribution${\\displaystyle f(k\\mid \\lambda )={\\frac {e^{-\\lambda }\\lambda ^{k}}{k!}}}$${\\displaystyle \\lambda }$${\\displaystyle \\lambda }$
' def test_table_involve_after_code(self): """test table involve code, code被提取出去了,过滤掉空的和坏的table.""" @@ -203,7 +203,7 @@ def test_nested_table1(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert '\n\n
\n\n
Advanced Search
\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n
Home
Browse
Communities \n\n & Collections
Issue Date
Author
Title
Subject
Sign on to:
Receive email \n\n updates
My APO\n\n
authorized users
Edit Profile
About DSpace
\n\n

ANSTO Publications Online > \n\n Journal Publications > \n\n Journal Articles >

\n\n
Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
\n\n
' in content + assert '
Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors:Rajkhowa, R\n\n
Naik, R\n\n
Wang, L\n\n
Smith, SV\n\n
Wang, X
Keywords:Radioisotopes \n\n Transition Elements
\n\nBinding Energy
\n\nFibers
\n\nAbsorption
\n\nIons
Issue Date:15-Mar-2011
Publisher:Wiley-Blackwell
Citation:
Search APO
Advanced Search
Home
Browse
Communities\n\n& Collections
Issue Date
Author
Title
Subject
Sign on to:
Receive email\n\nupdates
My APO\n\nauthorized users
Edit Profile
Help
About DSpace
ANSTO Publications Online >\n\nJournal Publications >\n\nJournal Articles >
Please use this identifier to cite or link to this item: http://apo.ansto.gov.au/dspace/handle/10238/2935
' in content def test_nested_table2(self): """复杂嵌套表格.""" @@ -213,7 +213,7 @@ def test_nested_table2(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 2 content = parts[1][0].text_content() - assert '
Title:An investigation into transition metal ion binding properties of silk fibers and particles using radioisotopes.
Authors:Rajkhowa, R\n\nNaik, R\n\nWang, L\n\nSmith, SV\n\nWang, X
Keywords:Radioisotopes\n\nTransition Elements\n\nBinding Energy\n\nFibers\n\nAbsorption\n\nIons
Issue Date:15-Mar-2011
Publisher:Wiley-Blackwell
jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }}); \n\n .acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;} \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n window.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\'); \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n Main Numbers: \n\n (615) 452-8600 \n\n (888) 335-8722 \n\n \n\n \n\n \n\n \n\n facebook \n\n instagram \n\n twitter \n\n youtube \n\n \n\n \n\n \n\n \n\n Campuses \n\n \n\n Gallatin \n\n Cookeville \n\n Livingston \n\n Springfield \n\n \n\n \n\n \n\n \n\n \n\n Academic Divisions \n\n \n\n Business & Technology \n\n Health Sciences \n\n Humanities & Fine Arts \n\n Mathematics & Science \n\n Nursing \n\n Social Science & Education \n\n \n\n \n\n \n\n \n\n \n\n Resources \n\n \n\n Accreditation \n\n Bookstore \n\n Campus Police \n\n Contact Us \n\n Employee Directory \n\n IT Help Desk \n\n Library \n\n Marketing & Communications
Volunteer State Community College
May 24, 2024
2013-2014 VSCC Catalog
Select a Catalog \n\n 2024-2025 Undergraduate Catalog \n\n 2023-2024 Undergraduate Catalog [ARCHIVED CATALOG] \n\n 2022-2023' in content + assert '
jQuery(document).ready( function($) { if ($(\'#gateway-page\').length) { jQuery("body").addClass("fontyourface layout-one-sidebar layout-sidebar-first wide hff-43 pff-43 sff-43 slff-43 fixed-header-enabled slideout-side-right transparent-header-active path-node page-node-type-page"); }});\n\n.acalog-custom .region--light-typography.region--dark-background a {font-weight:normal;} .acalog-custom ul.icons-list {margin:0} .acalog-custom ul.icons-list li {margin:5px 12px 5px 0;} #gateway-footer-copyright {background:#f6f8f9; font-family:\'Libre Franklin\', Helvetica Neue, Arial, sans-serif; padding:20px;}\n\nwindow.dataLayer = window.dataLayer || []; function gtag(){dataLayer.push(arguments);} gtag(\'js\', new Date()); gtag(\'config\', \'G-L4J2WT8RM8\');\n\nMain Numbers:\n\n(615) 452-8600\n\n(888) 335-8722\n\nfacebook\n\ninstagram\n\ntwitter\n\nyoutube\n\nCampuses\n\nGallatin\n\nCookeville\n\nLivingston\n\nSpringfield\n\nAcademic Divisions\n\nBusiness & Technology\n\nHealth Sciences\n\nHumanities & Fine Arts\n\nMathematics & Science\n\nNursing\n\nSocial Science & Education\n\nResources\n\nAccreditation\n\nBookstore\n\nCampus Police\n\nContact Us\n\nEmployee Directory\n\nIT Help Desk\n\nLibrary\n\nMarketing & Communications
Volunteer State Community College
May 24, 2024
2013-2014 VSCC Catalog
Select a Catalog\n\n2024-2025 Undergraduate Catalog\n\n2023-2024 Undergraduate Catalog [ARCHIVED CATALOG]\n\n2022-2023' in content def test_nested_table3(self): """复杂嵌套表格.""" @@ -223,7 +223,7 @@ def test_nested_table3(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 3 content = parts[2][0].text_content() - assert "
What's New - Recent Content \n\n \n\n Members' Peak Updates \n\n Recent Trip Reports \n\n Recent Trip Report Comments \n\n Recently added Images \n\n Recently added Peaks \n\n List Completers \n\n \n\n Height List Completers \n\n Elevation List Completers \n\n County Summit Completers \n\n Wilderness Area Completers \n\n Member Profiles & Stats \n\n \n\n Member Profiles - Summary Stats \n\n Member Stats by Date Range & Charts \n\n Calendar Grid Completions \n\n Peaks Repeated \n\n Most Climbed Peaks \n\n Unclimbed Peaks \n\n US Peak Totals by State \n\n Member Tools \n\n \n\n Closest 50 Peaks by Member \n\n \n\n Closest 50 Map \n\n Closest 50 List \n\n Download your Peak List \n\n Search Trip Reports \n\n Unclimbed by Custom Group \n\n Export CSV, GPX, POI, TOPO! Files \n\n Elevation Threshold Progress Maps \n\n State Highest # Progress Maps \n\n County Summit Progress Maps \n\n Statewide County Summit Maps \n\n Prominence Progress Maps \n\n State Quads Progress Maps \n\n Quadrangle Lookup \n\n Distance Calculator \n\n Slope Angle Calculator \n\n Stats Category Leaders \n\n US Highest 1,000 Peaks \n\n \n\n US Highest 1,000 Member Area \n\n 1,000 Highest Peak List \n\n US Steepest 1,000 Peaks \n\n \n\n Steepness Member Area \n\n View 1,000 Steepest List \n\n US 2,000' Prominence \n\n \n\n US Prominence Member Area \n\n View US Prominence Peak Profiles \n\n View Member 5k Completion Maps \n\n Prominence Progress Maps \n\n US County Highpoints \n\n \n\n County Highpoints Member Area \n\n Highpoint Profiles - By State \n\n View Member's Completion Maps \n\n US State Highpoints \n\n \n\n US State Highpoints Member Area \n\n View State Highpoints List \n\n View Member's Completion Maps \n\n US Wilderness Area Peaks \n\n \n\n Wilderness Summits Member Area \n\n Wilderness Area Detail by State \n\n Wilderness HPs Member Area \n\n US National Park Peaks \n\n \n\n National Park Peaks Member Area \n\n National Park Peaks Detail by State" in content + assert "
What's New - Recent Content\n\nMembers' Peak Updates\n\nRecent Trip Reports\n\nRecent Trip Report Comments\n\nRecently added Images\n\nRecently added Peaks\n\nList Completers\n\nHeight List Completers\n\nElevation List Completers\n\nCounty Summit Completers\n\nWilderness Area Completers\n\nMember Profiles & Stats\n\nMember Profiles - Summary Stats\n\nMember Stats by Date Range & Charts\n\nCalendar Grid Completions\n\nPeaks Repeated\n\nMost Climbed Peaks\n\nUnclimbed Peaks\n\nUS Peak Totals by State\n\nMember Tools\n\nClosest 50 Peaks by Member\n\nClosest 50 Map\n\nClosest 50 List\n\nDownload your Peak List\n\nSearch Trip Reports\n\nUnclimbed by Custom Group\n\nExport CSV, GPX, POI, TOPO! Files\n\nElevation Threshold Progress Maps\n\nState Highest # Progress Maps\n\nCounty Summit Progress Maps\n\nStatewide County Summit Maps\n\nProminence Progress Maps\n\nState Quads Progress Maps\n\nQuadrangle Lookup\n\nDistance Calculator\n\nSlope Angle Calculator\n\nStats Category Leaders\n\nUS Highest 1,000 Peaks\n\nUS Highest 1,000 Member Area\n\n1,000 Highest Peak List\n\nUS Steepest 1,000 Peaks\n\nSteepness Member Area\n\nView 1,000 Steepest List\n\nUS 2,000' Prominence\n\nUS Prominence Member Area\n\nView US Prominence Peak Profiles\n\nView Member 5k Completion Maps\n\nProminence Progress Maps\n\nUS County Highpoints\n\nCounty Highpoints Member Area\n\nHighpoint Profiles - By State\n\nView Member's Completion Maps\n\nUS State Highpoints\n\nUS State Highpoints Member Area\n\nView State Highpoints List\n\nView Member's Completion Maps\n\nUS Wilderness Area Peaks\n\nWilderness Summits Member Area\n\nWilderness Area Detail by State" in content def test_nested_table4(self): """复杂嵌套表格.""" @@ -233,4 +233,4 @@ def test_nested_table4(self): parts = self.rec.recognize(base_url, [(html_to_element(raw_html), html_to_element(raw_html))], raw_html) assert len(parts) == 4 content = parts[2][0].text_content() - assert '

Molecular line emissions from pre main sequence objects

Saraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). \n\n Molecular line emissions from pre main sequence objects. \n\n In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291. \n\n Full text available as:

\n\n
Preview
\n\n
PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\n
Download (239Kb)
    \n\n\n\n
    URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
    Google Scholar:Look up in Google Scholar
    \n\n

    Abstract

    We present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H 2 0 cooling.

    \n\n\n\n\n\n\n\n\n\n\n\n\n\n' in content + assert '
    Item Type:Conference Item
    Copyright Holders:1997 European Space Agency
    Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
    Academic Unit/Department:Science > Physical Sciences
    Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
    Item ID:32696
    Depositing User:Glenn White
    Molecular line emissions from pre main sequence objects\n\nSaraceno, P. ; Benedettini, M. ; Caux, E. ; Ceccarelli, M. C. ; Clegg, P. E. ; Correia, J. C. ; di Giorgio, A. M. ; Giannini, T. ; Griffin, M. J. ; Leeks, S. J. ; Liseau, R. ; Lorenzetti, D. ; Molinari, S. ; Nisini, B. ; Smith, H. ; Spinoglio, L. ; Tomassi, E. and White, G. J. (1997). Molecular line emissions from pre main sequence objects. In: The first ISO workshop on Analytical Spectroscopy , 6-8 October 1997, Madrid, Spain, p. 291.\n\nFull text available as:
    Preview
    PDF (Version of Record) - Requires a PDF viewer such asGSview ,Xpdf orAdobe Acrobat Reader\n\nDownload (239Kb)
    URL:http://cdsads.u-strasbg.fr/abs/1997ESASP.419..291S
    Google Scholar:Look up in Google Scholar
    Abstract\n\nWe present some preliminary results obtained with the LWS G.T. programme on the study of young objects driving molecular outflows. In particular, we discuss the importance of molecular emission in these sources and address the role of the H20 cooling.' in content diff --git a/tests/llm_web_kit/extractor/html/recognizer/test_title.py b/tests/llm_web_kit/extractor/html/recognizer/test_title.py index a0b71fa4..2edfbeab 100644 --- a/tests/llm_web_kit/extractor/html/recognizer/test_title.py +++ b/tests/llm_web_kit/extractor/html/recognizer/test_title.py @@ -18,17 +18,15 @@ def test_title_recognizer(title_recognizer): result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 10 - assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" - assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" + assert element_to_html(result[0][0]) == """大模型好,大模型棒1""" + assert element_to_html(result[6][0]) == """大模型好,大模型棒5 大模型很棒""" def test_title_tails_and_levels(title_recognizer): html_content = """

    TEST:import *TEST

    Tail

    aaa

    """ result = title_recognizer.recognize('http://www.baidu.com', [(html_content, html_content)], html_content) assert len(result) == 2 - assert element_to_html(result[0][0]) == '
    TEST: `import *` TEST
    ' + assert element_to_html(result[0][0]) == '
    TEST: `import *` TEST
    ' pass @@ -47,4 +45,4 @@ def test_title1(title_recognizer): with open(f'{os.path.dirname(os.path.abspath(__file__))}/assets/recognizer/title1.html', 'r') as file: html_content = file.read() result = title_recognizer.recognize('http://www.baidu.com', [(main_html_content, main_html_content)], html_content) - assert 'Compare vibrational frequencies for two calculations for C <sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) + assert 'Compare vibrational frequencies for two calculations for C<sub>3</sub> (carbon trimer)' in element_to_html(result[1][0]) diff --git a/tests/llm_web_kit/extractor/test_extractor_chain.py b/tests/llm_web_kit/extractor/test_extractor_chain.py index 5255efe9..533470ef 100644 --- a/tests/llm_web_kit/extractor/test_extractor_chain.py +++ b/tests/llm_web_kit/extractor/test_extractor_chain.py @@ -374,7 +374,7 @@ def test_table_include_math_p(self): result = chain.extract(input_data) content_list = result.get_content_list()._get_data() assert len(content_list[0]) == 17 - assert content_list[0][3]['content']['html'] == "
    Item Type:Conference Item
    Copyright Holders:1997 European Space Agency
    Extra Information:Proceedings of the first ISO workshop on Analytical Spectroscopy, Madrid, Spain, 6-8 October 1997. Editors: A.M. Heras, K. Leech, N. R. Trams, and Michael Perry. Noordwijk, The Netherlands : ESA Publications Division, c1997. (ESA SP-419), 1997., pp.291-292
    Academic Unit/Department:Science > Physical Sciences
    Interdisciplinary Research Centre:Centre for Earth, Planetary, Space and Astronomical Research (CEPSAR)
    Item ID:32696
    Depositing User:Glenn White
    up vote 17 down vote favorite \n\n 5
    I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?
    prime-numbers factoring
    " + assert content_list[0][3]['content']['html'] == "
    up vote 17 down vote favorite\n\n5I'm having problems with exercises on proving whether or not a given number is prime. Is $83^{27} + 1$ prime?\n\nprime-numbers factoring
    " def test_table_include_math_p_2(self): """table包含math和其他内容.""" @@ -386,7 +386,7 @@ def test_table_include_math_p_2(self): md_content = result.get_content_list().to_nlp_md() # with open('output_badcase_p2.md', 'w', encoding='utf-8') as f: # f.write(md_content) - self.assertIn('
    单位换算:

    数学公式区块: $1\\text{km}={10}^{3}\\text{m}$

    ', md_content) + self.assertIn('
    长度质量时间
    单位换算:数学公式区块: $1\\text{km}={10}^{3}\\text{m}$', md_content) def test_clean_tags(self): """测试clean_tag的preExtractor是否生效.""" @@ -491,7 +491,7 @@ def test_more_nt(self): result_content_list = result.get_content_list()._get_data() result = result_content_list[0][2]['content']['html'] assert '\n\t' not in result - assert len(result) == 2205 + assert len(result) == 1893 def test_math_physicsforums(self): """测试math_physicsforums网页中数学公式是[tex]和[itex]包裹的,且中间还有
    标签分割.""" @@ -636,7 +636,7 @@ def test_table_lack_pre_content(self): input_data = DataJson(test_data) result = chain.extract(input_data) result_content_list = result.get_content_list()._get_data() - assert result_content_list[0][22]['content']['html'] == '
    长度质量时间
    お名前【必須】お名前(カナ)
    ご連絡先【いずれか必須】

    メールアドレス

    電話番号

    ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。

    ' + assert result_content_list[0][22]['content']['html'] == '
    お名前【必須】お名前(カナ)
    ご連絡先【いずれか必須】
    メールアドレス電話番号
    ※メール受信制限をしている方は、@chintai.co.jpからのメールを受信できるよう設定の変更をお願い致します。
    ' def test_td_include_specila_symbol(self): """测试td包含特殊符号|,需要转义."""