Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions llm_web_kit/main_html_parser/parser/layout_batch_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,7 @@ def normalize_key(self, tup):
return None
tag, class_id, idd = tup
if class_id:
class_id = re.sub(r' +', ' ', class_id)

class_id = re.sub(r'[ \t\n]+', ' ', class_id)
if idd:
valid_id = self.ids.get(idd, True)
idd = re.sub(r' +', ' ', idd)
Expand Down Expand Up @@ -146,9 +145,9 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
length = len(self.get_tokens(element.text_content().strip()))
length_tail = 0
text = element.xpath('string()').strip()
is_natural_language = self.__is_natural_language(text) or length_tail >= 10
if element.tail:
length_tail = len(element.tail.strip())
is_natural_language = self.__is_natural_language(text) or length_tail >= 10
idd = element.get('id')
tag = element.tag
layer_nodes = element_dict.get(depth, {})
Expand Down Expand Up @@ -276,7 +275,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
# 判断当前节点是否是红色节点
if keyy in layer_nodes_dict:
if 'red' not in layer_nodes_dict[keyy]:
if self.more_noise_enable and tag in ['p', 'ul', 'br'] and not idd and is_natural_language:
if self.more_noise_enable and tag in ['p', 'ul', 'br', 'b'] and not idd and is_natural_language:
label = 'red'
else:
parent = element.getparent()
Expand Down Expand Up @@ -397,6 +396,7 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False,
id_exist=False):
current_norm_key = (self.normalize_key((current_layer_key[0], None, None)), parent_key)
first_class_res = None, None, None
for ele_keyy, ele_value in layer_nodes.items():
# class id要存在
if class_must and not ele_keyy[1]:
Expand Down Expand Up @@ -432,9 +432,8 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail

return None, None, None
first_class_res = ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
return first_class_res

def __is_natural_language(self, text, min_words=10):
"""判断文本是否像自然语言.
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"item_id 1": 1,
"item_id 2": 1,
"item_id 3": 1,
"item_id 4": 1,
"item_id 5": 1,
"item_id 6": 1,
"item_id 7": 1,
"item_id 8": 1,
"item_id 9": 1,
"item_id 10": 1,
"item_id 11": 1,
"item_id 12": 1,
"item_id 13": 1,
"item_id 14": 1,
"item_id 15": 1,
"item_id 16": 1,
"item_id 17": 1,
"item_id 18": 1,
"item_id 19": 0,
"item_id 20": 0,
"item_id 21": 0,
"item_id 22": 0,
"item_id 23": 0,
"item_id 24": 0,
"item_id 25": 0,
"item_id 26": 1,
"item_id 27": 1,
"item_id 28": 1,
"item_id 29": 0
}

Large diffs are not rendered by default.

27 changes: 27 additions & 0 deletions tests/llm_web_kit/main_html_parser/parser/test_layout_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,3 +420,30 @@ def test_all_ids(self):
parts = parser.parse(pre_data)
main_html_body = parts[PreDataJsonKey.MAIN_HTML_BODY]
assert '全部按定尺或倍尺供應,提高材料的利用率' in main_html_body and '在線留言' not in main_html_body and '批發兼零售' not in main_html_body

def test_multi_same_first_class_id(self):
    """Run the map + layout-batch pipeline on the `test_multi_same_first_class_id` fixtures.

    The tagged template page and the LLM labels are first mapped onto HTML
    tags by ``MapItemToHtmlTagsParser``; ``LayoutBatchParser`` is then run
    with dynamic id matching, dynamic class-id matching and more-noise mode
    all enabled. The extracted main HTML must keep the article sentence and
    must not contain the 'Photography' text.
    """
    # Load the fixtures: the tagged template page, the page to extract from,
    # and the LLM labelling response.
    typical_raw_tag_html = base_dir.joinpath(
        'assets/input_layout_batch_parser/test_multi_same_first_class_id_tag.html').read_text(encoding='utf-8')
    html_source = base_dir.joinpath(
        'assets/input_layout_batch_parser/test_multi_same_first_class_id.html').read_text(encoding='utf-8')
    llm_path = 'assets/input_layout_batch_parser/test_multi_same_first_class_id.json'
    llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8'))

    # NOTE(review): both 'typical_raw_tag_html' and 'typical_raw_html' receive
    # the tagged template HTML, as in the original test — confirm this is intended.
    pre_data = PreDataJson({
        'typical_raw_tag_html': typical_raw_tag_html,
        'typical_raw_html': typical_raw_tag_html,
        'llm_response': llm_response,
        'html_source': html_source,
    })

    # Mapping step: project the LLM item labels onto the template's HTML tags.
    pre_data = MapItemToHtmlTagsParser({}).parse(pre_data)

    # Propagation step: apply the mapped template to `html_source`.
    pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True
    pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True
    pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True
    parts = LayoutBatchParser({}).parse(pre_data)
    main_html_body = parts[PreDataJsonKey.MAIN_HTML_BODY]

    # Separate asserts so a failure reports which expectation was violated.
    assert 'Spredfast wanted to follow' in main_html_body
    assert 'Photography' not in main_html_body