diff --git a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py index 5259cea3..5d6ef2e4 100644 --- a/llm_web_kit/main_html_parser/parser/layout_batch_parser.py +++ b/llm_web_kit/main_html_parser/parser/layout_batch_parser.py @@ -115,8 +115,7 @@ def normalize_key(self, tup): return None tag, class_id, idd = tup if class_id: - class_id = re.sub(r' +', ' ', class_id) - + class_id = re.sub(r'[ \t\n]+', ' ', class_id) if idd: valid_id = self.ids.get(idd, True) idd = re.sub(r' +', ' ', idd) @@ -146,9 +145,9 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab length = len(self.get_tokens(element.text_content().strip())) length_tail = 0 text = element.xpath('string()').strip() - is_natural_language = self.__is_natural_language(text) or length_tail >= 10 if element.tail: length_tail = len(element.tail.strip()) + is_natural_language = self.__is_natural_language(text) or length_tail >= 10 idd = element.get('id') tag = element.tag layer_nodes = element_dict.get(depth, {}) @@ -276,7 +275,7 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab # 判断当前节点是否是红色节点 if keyy in layer_nodes_dict: if 'red' not in layer_nodes_dict[keyy]: - if self.more_noise_enable and tag in ['p', 'ul', 'br'] and not idd and is_natural_language: + if self.more_noise_enable and tag in ['p', 'ul', 'br', 'b'] and not idd and is_natural_language: label = 'red' else: parent = element.getparent() @@ -397,6 +396,7 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False, id_exist=False): current_norm_key = (self.normalize_key((current_layer_key[0], None, None)), parent_key) + first_class_res = None, None, None for ele_keyy, ele_value in layer_nodes.items(): # class id要存在 if class_must and not ele_keyy[1]: @@ -432,9 +432,8 @@ def __match_tag(self, 
layer_nodes, current_layer_key, parent_key, node_html, tem norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None)) norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy) if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class: - return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail - - return None, None, None + first_class_res = ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail + return first_class_res def __is_natural_language(self, text, min_words=10): """判断文本是否像自然语言. diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.html new file mode 100644 index 00000000..859f2c73 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.html @@ -0,0 +1,1598 @@ + + + + + + + + + + + + + + + + + Christopher Clements - Spredfast Social Suite and Saloon 2016 + + +
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ +
+
+
+ + + + + +
+
Brief
Spredfast wanted to follow its successful 2015 SXSW presence with a stand-out, expanded suite experience and equally enticing party at the Moody Theater.
Process
We leveraged a public focus on space exploration, along with the recently released NASA Apollo missions image archive to create a truly unique, on-trend event experience that earned Spredfast a seat on CNBC's best brand experiences at SXSW list. Also on the list were brands like McDonald's, Spotify, American Greetings, Gatorade, Visa and Deloitte Digital.
Competencies
Creative direction, Graphic design, Event design, UX design, Illustration
Key Metrics
Event attendance tied to pipeline influence, social engagement, hashtag usage, press coverage, number of sales meetings planned/completed.

+
+ + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + +
+
Theme concepting sketches:
+
+ + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + +
+
Style guide:
+
+ + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + +
+
Asset examples:
+
+ + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+ + + + + +
+
+
+ +
+
+ +
+ + + + + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+
+
+ + + + + + + + +
+
Photos from the event:
+
+ + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+ + + +
+
+
+
+
+ Back to Top +
+ + + + + + + +
+ +
+
+
+
+ + + + + + diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.json b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.json new file mode 100644 index 00000000..35e642e1 --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id.json @@ -0,0 +1,31 @@ +{ + "item_id 1": 1, + "item_id 2": 1, + "item_id 3": 1, + "item_id 4": 1, + "item_id 5": 1, + "item_id 6": 1, + "item_id 7": 1, + "item_id 8": 1, + "item_id 9": 1, + "item_id 10": 1, + "item_id 11": 1, + "item_id 12": 1, + "item_id 13": 1, + "item_id 14": 1, + "item_id 15": 1, + "item_id 16": 1, + "item_id 17": 1, + "item_id 18": 1, + "item_id 19": 0, + "item_id 20": 0, + "item_id 21": 0, + "item_id 22": 0, + "item_id 23": 0, + "item_id 24": 0, + "item_id 25": 0, + "item_id 26": 1, + "item_id 27": 1, + "item_id 28": 1, + "item_id 29": 0 +} \ No newline at end of file diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id_tag.html b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id_tag.html new file mode 100644 index 00000000..504cec0e --- /dev/null +++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/test_multi_same_first_class_id_tag.html @@ -0,0 +1,1250 @@ + + + + + + + + + + + + + + + + +Christopher Clements - Upland Software brand refresh + + +
+
+
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+ +
+
+
+
+
+ +
+
+
+
+
Brief
+
When I arrived at Upland, the company was beginning a major transition in operating style and GTM strategy, and they needed a fresh brand to help re-introduce the company to the world. Additionally, the old brand system was virtually unusable, without many of the tools and trappings that a modern, extensible, and accessible brand system has to have. Additional considerations were a wide variety of stakeholder groups (as Upland is a public company) as well as their decentralized, hub-and-spoke internal structure with certain functions sitting at a 'corporate' level, and individual business unit marketing teams.
+
Solution
+
Upon realizing the scope and scale of this effort, and considering the very limited internal resources, we knew we needed to bring our creative, design, and branding networks to bear on the project. Collaborating across key internal teams, we brought in critical partners Unfettered and The Graphic Standard to get things rolling. Together, we guided key stakeholders through a robust discovery process that helped us land at core company values, brand promise, and stack hands market positioning where we knew Upland could really be competitive. 

From this foundational information, we quickly explored visual expression, honing in on a new visual brand that fit Upland perfectly, and would continue to grow with the complexity and size of the company for years to come. 
+
Competencies
+
Creative direction, vendor management, project planning, outsourcing, graphic design, illustration, UX, education & training
+
Key Metrics
+
Completion of key brand assets (web experience, brand system and components, templates, guidance documents, etc), usability and access, buy-in from key stakeholders and executive leadership, competitive within peer set, clarify values and brand promise
+
+
+
+
Here's a brief look at how Upland was showing up in the market pre-refresh. Drowning in a sea of same, the company's communications were uninspiring, bogged down with superfluous language, and unable to clearly communicate.
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
Here's a look at some of that foundational work identified early on, focusing the key stakeholders in on values, market positioning, and value proposition / promise: 
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
Moving into visual exploration, the design team was able to guide the stakeholders towards the most successful execution (also our favorite) and sold the new direction through with little meaningful friction. 
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
Here's a closer look into a few of the refreshed brand system components, starting with the refreshed logo and hierarchy:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
+ +
+
+
+
+ +
+
+
+
+ +
+
+
+
Simple linework illustrations we call 'elements' act as grounding and foundational visuals that help reinforce feelings of order, intention, logic, and precision. These are used in a variety of ways across the brand system, further extending the number of visual options users have at their disposal. 
+
+
+
+ +
+
+
+
Instead of leveraging traditional iconography sets, we opted to introduce 'geometries'...small, glyph-like visuals that unlock more conceptual storytelling capabilities, especially when used in the context of presentation. Rather than finding that 'perfect' icon from a set that aligns to your messaging or story, meaning is assigned to more abstract forms, again nodding to the flexibility of this system.
+
+
+
+ +
+
+
+
Illustration plays a big part in the brand system. By leveraging the foundational components of line work, color, simple shape, and our unique slanted angle, we arrive at an illustration style that's own-able for the brand. Additionally, this approach allows us to quickly conceptualize and showcase product features for newly acquired tech, something Upland does 4-6 times per calendar year. 
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
Turning up the fidelity on the illustrations one more notch, we can use the same direction to simplify and stylize screenshots. This is incredibly important to promote a sense of consistency across the products, as it takes time to transition a newly acquired product into the new design language system. In the mean time, we can leverage stylized screens to communicate functionality, while also reducing visual complexity and focusing the viewer on specific features:
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
From the very start, we knew that this illustration direction opened up huge opportunities for motion. We've had a few opportunities to experiment with full videos as well as looping GIFs, with very promising results. 
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
+ +
+
+
+
+
An important part of any brand system is its templates, and there was certainly no shortage of template needs for Upland. Working with our newly defined brand system, we engaged internal and external designers to create a library of templates covering a wide variety of use cases.
+
+
+
Social and display ads:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
Documentation and sales enablement:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
eBook and Whitepaper:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
Case study and Infographic:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
Powerpoint and Salesforce App Exchange:
+
+
+
+
+ + + +
+
+ + + +
+
+
+
+
+
Email and service plans:
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
and of course all the obligatory office-type documents:
+
+
+
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+
+
+
+
One of the most significant changes was our web presence, as evidenced by the stark difference between old and new home pages:
+
+
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
+
+
To get a taste for the various modules and layouts that we implemented for the new site, check out the Figma project and prototype below:
+
+
+
+
+ +
+
+
+
+
+
+ +
+
+
+
+
With a project as massive as this, it 100% takes an army of talented folks to get it done. My sincerest thanks go out to key internal and external players who helped bring this new brand to life:

Always-resilient Uplanders:
Jim Rudden, Virginia Miracle, Meredith Begin, Rod Favaron, Kendell Kelton, Sara Whitwer, Justin Schiavoni, Rachel Quinn, Daiko Hachiya & the product design team.

Design and branding legends who lent their time and talents to build out system:
Brett Eaton and Sharon Arellano and the Unfettered team, Shane Bzdok, John Norton, Gray Luckett and The Graphic Standard team, Rex Burns, Tom Reardon, Courtney Boyle, Annalee Lanier, Paulo Selletti @ Hypnotic Design, Dustin Scott @ GreatJob.TV, Megan Willin, Maggie Moore, Mariella Krause, Scott McAfee, Zoe Randolph, Barry Epstein, Todd Kelgard and the Ovation Solutions team.

Y'all moved mountains. Thank you.
+
+
+
+
+
+
+Back to Top +
+ + + + + + + +
+ +
+
+
+
def test_multi_same_first_class_id(self):
    """Run the layout parser on the ``test_multi_same_first_class_id`` fixtures.

    The LLM item labels are first mapped onto the tagged template page
    (``*_tag.html``); the resulting template is then applied to the second
    page with the dynamic-id, dynamic-class-id and more-noise options
    enabled. The extracted main HTML must keep the project brief text and
    must not contain the 'Photography' text.
    """
    # Build the test HTML: the tagged template page and the page to extract from.
    typical_raw_tag_html = base_dir.joinpath(
        'assets/input_layout_batch_parser/test_multi_same_first_class_id_tag.html'
    ).read_text(encoding='utf-8')
    html_source = base_dir.joinpath(
        'assets/input_layout_batch_parser/test_multi_same_first_class_id.html'
    ).read_text(encoding='utf-8')

    # Load the model (LLM) labelling result for the template page.
    llm_path = 'assets/input_layout_batch_parser/test_multi_same_first_class_id.json'
    llm_response = json.loads(base_dir.joinpath(llm_path).read_text(encoding='utf-8'))

    pre_data = PreDataJson({
        'typical_raw_tag_html': typical_raw_tag_html,
        'typical_raw_html': typical_raw_tag_html,
        'llm_response': llm_response,
        'html_source': html_source,
    })

    # Mapping step: attach the item labels to the template's HTML tags.
    parser = MapItemToHtmlTagsParser({})
    pre_data = parser.parse(pre_data)

    # Generalisation step: apply the labelled template to the source page.
    pre_data[PreDataJsonKey.DYNAMIC_ID_ENABLE] = True
    pre_data[PreDataJsonKey.DYNAMIC_CLASSID_ENABLE] = True
    pre_data[PreDataJsonKey.MORE_NOISE_ENABLE] = True
    parser = LayoutBatchParser({})
    parts = parser.parse(pre_data)
    main_html_body = parts[PreDataJsonKey.MAIN_HTML_BODY]

    # Separate asserts so a failure shows which expectation was violated.
    assert 'Spredfast wanted to follow' in main_html_body
    assert 'Photography' not in main_html_body