From 3e073dfbebbc65f995d4df946c1650699a26782c Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:17:56 +0100 Subject: [PATCH] feat(MS Word backend): Make detection of headers and other styles localization agnostic (#534) Use style IDs instead of style names, since style IDs should be localization-agnostic. Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/msword_backend.py | 35 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index bab956a7..037ba005 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -1,4 +1,5 @@ import logging +import re from io import BytesIO from pathlib import Path from typing import Set, Union @@ -166,6 +167,14 @@ def str_to_int(self, s, default=0): except ValueError: return default + def split_text_and_number(self, input_string): + match = re.match(r"(\D+)(\d+)$|^(\d+)(\D+)", input_string) + if match: + parts = list(filter(None, match.groups())) + return parts + else: + return [input_string] + def get_numId_and_ilvl(self, paragraph): # Access the XML element of the paragraph numPr = paragraph._element.find( @@ -188,7 +197,7 @@ def get_numId_and_ilvl(self, paragraph): def get_label_and_level(self, paragraph): if paragraph.style is None: return "Normal", None - label = paragraph.style.name + label = paragraph.style.style_id if label is None: return "Normal", None if ":" in label: @@ -197,7 +206,7 @@ def get_label_and_level(self, paragraph): if len(parts) == 2: return parts[0], int(parts[1]) - parts = label.split(" ") + parts = self.split_text_and_number(label) if "Heading" in label and len(parts) == 2: parts.sort() @@ -225,7 +234,7 @@ def handle_text_elements(self, element, docx_obj, doc): # Identify wether list is a numbered list or not # is_numbered = "List Bullet" not in paragraph.style.name is_numbered = 
False - p_style_name, p_level = self.get_label_and_level(paragraph) + p_style_id, p_level = self.get_label_and_level(paragraph) numid, ilevel = self.get_numId_and_ilvl(paragraph) if numid == 0: @@ -237,14 +246,14 @@ def handle_text_elements(self, element, docx_obj, doc): element, docx_obj, doc, - p_style_name, + p_style_id, p_level, numid, ilevel, text, is_numbered, ) - self.update_history(p_style_name, p_level, numid, ilevel) + self.update_history(p_style_id, p_level, numid, ilevel) return elif numid is None and self.prev_numid() is not None: # Close list for key, val in self.parents.items(): @@ -252,23 +261,23 @@ def handle_text_elements(self, element, docx_obj, doc): self.parents[key] = None self.level = self.level_at_new_list - 1 self.level_at_new_list = None - if p_style_name in ["Title"]: + if p_style_id in ["Title"]: for key, val in self.parents.items(): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) - elif "Heading" in p_style_name: - self.add_header(element, docx_obj, doc, p_style_name, p_level, text) + elif "Heading" in p_style_id: + self.add_header(element, docx_obj, doc, p_style_id, p_level, text) - elif p_style_name in [ + elif p_style_id in [ "Paragraph", "Normal", "Subtitle", "Author", "Default Text", - "List Paragraph", - "List Bullet", + "ListParagraph", + "ListBullet", "Quote", ]: level = self.get_level() @@ -284,7 +293,7 @@ def handle_text_elements(self, element, docx_obj, doc): label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text ) - self.update_history(p_style_name, p_level, numid, ilevel) + self.update_history(p_style_id, p_level, numid, ilevel) return def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): @@ -322,7 +331,7 @@ def add_listitem( element, docx_obj, doc, - p_style_name, + p_style_id, p_level, numid, ilevel,