This topic has 4 replies, 2 voices, and was last updated 1 year agoby .
- You must be logged in to reply to this topic.
diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index eede6cfb..493d46cc 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,12 +19,11 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') - is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) + simplified_html, original_html = simplify_html(typical_raw_html) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: @@ -33,6 +32,5 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: # 设置输出数据 pre_data[PreDataJsonKey.TYPICAL_RAW_TAG_HTML] = original_html # 保存原始标签HTML pre_data[PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML] = simplified_html # 保存简化后的HTML - pre_data[PreDataJsonKey.XPATH_MAPPING] = _ # 保存xpath return pre_data diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index bfff8f29..3c37da8e 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -3,33 +3,34 @@ import uuid from typing import Dict, List, Tuple -from bs4 import BeautifulSoup from lxml import etree, html +from selectolax.parser import HTMLParser # 行内标签 inline_tags = { 'map', 'optgroup', 'span', 'br', 'input', 'time', 'u', 'strong', 'textarea', 'small', 'sub', 'samp', 'blink', 'b', 'code', 'nobr', 'strike', 'bdo', 'basefont', 'abbr', 'var', 'i', 'cccode-inline', - 'select', 's', 'pic', 'label', 'mark', 'object', 'dd', 'dt', 'ccmath-inline', 'svg', 'li', + 'select', 's', 'pic', 'label', 'mark', 'object', 'ccmath-inline', 'svg', 'button', 'a', 'font', 
'dfn', 'sup', 'kbd', 'q', 'script', 'acronym', 'option', 'img', 'big', 'cite', 'em', 'marked-tail', 'marked-text' - # 'td', 'th' + # 'td', 'th', 'dd', 'dt', 'li' } +# 表格内部可能包含的跟表格相关的标签 +table_tags_set = {"caption", "colgroup", "col", "thead", "tbody", "tfoot", "tr", "td", "th"} + # 需要删除的标签 tags_to_remove = { + 'title', 'head', - 'header', - 'footer', 'nav', - 'aside', 'style', 'script', 'noscript', 'link', 'meta', 'iframe', - 'frame' + 'frame', } # 需要保留的特殊标签(即使它们是行内标签) @@ -37,7 +38,7 @@ # 需要删除的属性名模式(独立单词) ATTR_PATTERNS_TO_REMOVE = { - 'nav', 'footer', 'header', # 独立单词 + 'nav', # 'footer', 'header', # 独立单词 } # 需要删除的属性名模式(特定前缀/后缀) @@ -72,90 +73,76 @@ def build_uid_map(dom: html.HtmlElement) -> Dict[str, html.HtmlElement]: return {node.get('data-uid'): node for node in dom.iter() if node.get('data-uid')} -def is_unique_attribute(tree, attr_name, attr_value): - """检查给定的属性名和值组合是否在文档中唯一。""" - elements = tree.xpath(f"//*[@{attr_name}='{attr_value}']") - return len(elements) == 1 - - -def get_relative_xpath(element): - root_tree = element.getroottree() - current_element = element - path_from_element = [] - found_unique_ancestor = False - - # 从当前元素开始向上查找 - while current_element is not None and current_element.getparent() is not None: - siblings = [sib for sib in current_element.getparent() if sib.tag == current_element.tag] - - # 检查当前元素是否有唯一属性 - unique_attr = None - candidate_attrs = [ - attr for attr in current_element.attrib - if not (attr.startswith('data-') or attr == 'style' or - attr == '_item_id' or - (current_element.attrib[attr].startswith('{') and current_element.attrib[attr].endswith('}'))) - ] - - for attr in candidate_attrs: - if is_unique_attribute(root_tree, attr, current_element.attrib[attr]): - unique_attr = attr +def judge_table_parent(table_element, node_list): + for node in node_list: + ancestor = node.getparent() + while ancestor is not None: + if ancestor is table_element: + return True + elif ancestor.tag == 'table': break - - # 如果有唯一属性,构建相对路径并停止向上查找 
- if unique_attr is not None: - path_from_element.insert(0, f'*[@{unique_attr}="{current_element.attrib[unique_attr]}"]') - found_unique_ancestor = True - break - else: - # 没有唯一属性,使用常规方式 - if len(siblings) > 1: - index = siblings.index(current_element) + 1 - path_from_element.insert(0, f'{current_element.tag}[{index}]') - else: - path_from_element.insert(0, current_element.tag) - - current_element = current_element.getparent() - - # 构建最终的XPath - if found_unique_ancestor: - return f'//{"/".join(path_from_element)}' - else: - # 如果没有找到唯一属性祖先,返回完整路径 - return f'//{"/".join(path_from_element)}' + ancestor = ancestor.getparent() + return False def is_data_table(table_element: html.HtmlElement) -> bool: """判断表格是否是数据表格而非布局表格.""" - # 检查表格是否有 caption 标签 - if table_element.xpath('.//caption'): - return True - - # 检查是否有 th 标签 - if table_element.xpath('.//th'): + # 检查当前表格(不包括内部嵌套表格)是否有 caption 标签 + caption_nodes = table_element.xpath('.//caption') + if judge_table_parent(table_element, caption_nodes): return True - # 检查是否有 thead 或 tfoot 标签 - if table_element.xpath('.//thead') or table_element.xpath('.//tfoot'): + # 检查当前表格(不包括内部嵌套表格)是否有 colgroup 或 col 标签 + col_nodes = table_element.xpath('.//col') + colgroup_nodes = table_element.xpath('.//colgroup') + if judge_table_parent(table_element, col_nodes) or judge_table_parent(table_element, colgroup_nodes): return True - # 检查是否有 colgroup 或 col 标签 - if table_element.xpath('.//colgroup') or table_element.xpath('.//col'): - return True - - # 检查是否有 summary 属性 - if table_element.get('summary'): + # 检查当前表格(不包括内部嵌套表格)单元格是否有 headers 属性 + cell_nodes = table_element.xpath(".//*[self::td or self::th][@headers]") + if judge_table_parent(table_element, cell_nodes): return True # 检查是否有 role="table" 或 data-table 属性 if table_element.get('role') == 'table' or table_element.get('data-table'): return True - # 检查单元格是否有 headers 属性 - if table_element.xpath('.//*[@headers]'): + for node in table_element.iterdescendants(): + if node.tag in table_tags_set: + 
continue + if node.tag not in inline_tags: + return False + + return True + + +def has_non_listitem_children(list_element): + """检查列表元素是否包含非列表项的直接子节点. + + :param list_element: lxml元素对象 (ul, ol, dl) + :return: True 如果存在非列表项的直接子节点,否则 False + """ + + # 根据列表类型确定允许的子元素标签 + if list_element.tag in ['ul', 'ol']: + allowed_tags = {'li'} + elif list_element.tag == 'dl': + allowed_tags = {'dt', 'dd'} + + # 使用XPath直接查找是否存在不允许的直接子元素 + # 例如,对于
In mathematics, an Appell sequence, named after Paul Émile Appell, is any polynomial sequence \( \{p_n(x)\}_{n=0,1,2,\ldots} \) satisfying the identity
\( {d \over dx} p_n(x) = np_{n-1}(x), \)
and in which \( p_0(x) \) is a non-zero constant.
Among the most notable Appell sequences besides the trivial example \( \{x^n\} \) are the Hermite polynomials, the Bernoulli polynomials, and the Euler polynomials. Every Appell sequence is a Sheffer sequence, but most Sheffer sequences are not Appell sequences.
Equivalent characterizations of Appell sequences
The following conditions on polynomial sequences can easily be seen to be equivalent:
For n = 1, 2, 3, ...,
\( {d \over dx} p_n(x) = np_{n-1}(x) \)
and \( p_0(x) \) is a non-zero constant;
For some sequence \( \{c_n\}_{n=0,1,2,\ldots} \) of scalars with \( c_0 \neq 0 \),
\( p_n(x) = \sum_{k=0}^n {n \choose k} c_k x^{n-k}; \)
For the same sequence of scalars,
\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n, \)
where
\( D = {d \over dx}; \)
For n = 0, 1, 2, ...,
\( p_n(x+y) = \sum_{k=0}^n {n \choose k} p_k(x) y^{n-k}. \)
Recursion formula
Suppose
\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n = Sx^n, \)
where the last equality is taken to define the linear operator S on the space of polynomials in x. Let
\( T = S^{-1} = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right)^{-1} = \sum_{k=0}^\infty {a_k \over k!} D^k \)
be the inverse operator, the coefficients \( a_k \) being those of the usual reciprocal of a formal power series (in particular \( a_0 = 1/c_0 \neq 0 \)), so that
\( Tp_n(x) = x^n.\, \)
In the conventions of the umbral calculus, one often treats this formal power series T as representing the Appell sequence \( \{p_n\} \). One can define
\( \log T = \log\left(\sum_{k=0}^\infty {a_k \over k!} D^k \right) \)
by using the usual power series expansion of the log(1 + x) and the usual definition of composition of formal power series. Then we have
\( p_{n+1}(x) = (x - (\log T)')p_n(x).\, \)
(This formal differentiation of a power series in the differential operator D is an instance of Pincherle differentiation.)
In the case of Hermite polynomials, this reduces to the conventional recursion formula for that sequence.
Subgroup of the Sheffer polynomials
The set of all Appell sequences is closed under the operation of umbral composition of polynomial sequences, defined as follows. Suppose \( \{ p_n(x) : n = 0, 1, 2, 3, \ldots \} \) and \( \{ q_n(x) : n = 0, 1, 2, 3, \ldots \} \) are polynomial sequences, given by
\( p_n(x)=\sum_{k=0}^n a_{n,k}x^k\ \mbox{and}\ q_n(x)=\sum_{k=0}^n b_{n,k}x^k. \)
Then the umbral composition \( p \circ q \) is the polynomial sequence whose \( n \)th term is
\( (p_n\circ q)(x)=\sum_{k=0}^n a_{n,k}q_k(x)=\sum_{0\le \ell \le k \le n} a_{n,k}b_{k,\ell}x^\ell \)
(the subscript \( n \) appears in \( p_n \), since this is the \( n \)th term of that sequence, but not in \( q \), since this refers to the sequence as a whole rather than one of its terms).
Under this operation, the set of all Sheffer sequences is a non-abelian group, but the set of all Appell sequences is an abelian subgroup. That it is abelian can be seen by considering the fact that every Appell sequence is of the form
\( p_n(x) = \left(\sum_{k=0}^\infty {c_k \over k!} D^k\right) x^n, \)
and that umbral composition of Appell sequences corresponds to multiplication of these formal power series in the operator D.
Different convention
Another convention followed by some authors (see Chihara) defines this concept in a different way, conflicting with Appell's original definition, by using the identity
\( {d \over dx} p_n(x) = p_{n-1}(x) \)
instead.
See also
Sheffer sequence
Umbral calculus
Generalized Appell polynomials
Wick product
References
Paul Appell, "Sur une classe de polynômes", Annales scientifiques de l'École Normale Supérieure 2e série, tome 9, 1880.
Steven Roman and Gian-Carlo Rota, "The Umbral Calculus", Advances in Mathematics, volume 27, pages 95 – 188, (1978).
G.-C. Rota, D. Kahaner, and A. Odlyzko, "Finite Operator Calculus", Journal of Mathematical Analysis and its Applications, vol. 42, no. 3, June 1973. Reprinted in the book with the same title, Academic Press, New York, 1975.
Steven Roman. The Umbral Calculus. Dover Publications.
Theodore Seio Chihara (1978). An Introduction to Orthogonal Polynomials. Gordon and Breach, New York. ISBN 0-677-04150-0.
External links
Appell Sequence at MathWorld
Retrieved from "http://en.wikipedia.org/"
All text is available under the terms of the GNU Free Documentation License
Category:
Autor:
Words:
+ For many years, I wasn't really happy with the property I was living in. I thought that the kitchen was too small. I hated trying to cook family meals as it was super stressful in such a tiny space. I used to dream about selling the house and buying somewhere new, but I didn't really have the money and I didn't want to move area with my kids still in school. My friend suggested that I hire a team of construction contractors and ask them to extend the kitchen space out into the backyard. The contractors were great. They demolished one wall and then laid a new foundation for the extension. Next, the walls were constructed and then the roof was installed. I am so happy with my new kitchen. +
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
+[Read More]
+
The Magicians and Magic Hub
+ + +
+ + +| · | ++ + 国学四人谈:从孟子… + + | +2018/05/20 |
| · | ++ + 深切的悼念,永恒的… + + | +2018/05/09 |
| · | ++ + 【唐明邦先生纪念专… + + | +2018/05/08 |
| · | ++ + 【唐明邦先生纪念专… + + | +2018/05/08 |
| · | ++ + 珞珈佛学讲坛:《禅… + + | +2018/04/24 |
| · | ++ + 2018届“方太青竹简… + + | +2018/06/10 |
| · | ++ + 深切的悼念,永恒的… + + | +2018/05/07 |
| · | ++ + 讣告——唐明邦先生… + + | +2018/05/07 |
| · | ++ + “弘毅学堂”国学班… + + | +2017/11/14 |
| · | ++ + 武汉大学国学院“弘… + + | +2017/09/25 |
| · | ++ + 【楚天都市报】于亭… + + | +2018/06/22 |
| · | ++ + 【光明日报】郭齐勇… + + | +2018/06/22 |
| · | ++ + 【澎湃新闻】专访武… + + | +2018/06/22 |
| · | ++ + 【文汇报】郭齐勇:… + + | +2018/06/22 |
| · | ++ + 【文汇讲堂】“接着… + + | +2018/06/22 |
Copyright © 2014 武汉大学国学院版权所有 All Rights Reserved. 地址:中国·武汉·珞珈山 邮编:430072
+|
+ РУС |
+|
| + | +|
| + + + + | ++ |
+Thermo+ + |
++ |
+ ![]() |
+ + | ++ | +
+
+ + + |
| + | |||
+
+
+
|
+
+
+
+
+
+
+
+ +AIT Associated Repository of Academic Resources > + + ++ Browsing by Author 中村, 栄治 ++ + + + + +
+
+
+
+
+ Showing results 5 to 24 of 37
+
+
+
+ < previous
+ next >
+
+
+
+
+ Showing results 5 to 24 of 37
+
+
+
+ < previous
+ next >
+
+
+
+
+ + |
+
+ |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
+
|
+ ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

+
+
+
+ + + + |
+
+
+
+
+
+
+
+
+
++ +
+
+
+
+
+
+
+
++ + |
+
+
+
+
+
+
+
+ + + + +
+ + + + |
+
+
+