From b730b2d7a04a8773a00ed88889d28b0c476ba052 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:37:25 +0100 Subject: [PATCH] fix: Missing text in docx (t tag) when embedded in a table (#528) Fix for missing text in docx (t tag) when embedded in a table Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/msword_backend.py | 33 +++++++++++++++++++------------ 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 05508712..bab956a7 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -133,7 +133,6 @@ def get_level(self) -> int: def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname - # Check for Inline Images (blip elements) namespaces = { "a": "http://schemas.openxmlformats.org/drawingml/2006/main", @@ -153,6 +152,7 @@ def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: self.handle_pictures(element, docx_obj, drawing_blip, doc) # Check for Text elif tag_name in ["p"]: + # "tcPr", "sectPr" self.handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") @@ -219,7 +219,6 @@ def handle_text_elements(self, element, docx_obj, doc): if paragraph.text is None: return text = paragraph.text.strip() - # if len(text)==0 # keep empty paragraphs, they seperate adjacent lists! # Common styles for bullet and numbered lists. # "List Bullet", "List Number", "List Paragraph" @@ -291,9 +290,7 @@ def handle_text_elements(self, element, docx_obj, doc): def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): level = self.get_level() if isinstance(curr_level, int): - if curr_level > level: - # add invisible group for i in range(level, curr_level): self.parents[i] = doc.add_group( @@ -301,9 +298,7 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): label=GroupLabel.SECTION, name=f"header-{i}", ) - elif curr_level < level: - # remove the tail for key, val in self.parents.items(): if key >= curr_level: @@ -314,7 +309,6 @@ def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str): text=text, level=curr_level, ) - else: self.parents[self.level] = doc.add_heading( parent=self.parents[self.level - 1], @@ -346,7 +340,7 @@ def add_listitem( label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] ) - # TODO: Set marker and enumerated arguments if this is an enumeration element. + # Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 if is_numbered: enum_marker = str(self.listIter) + "." @@ -365,8 +359,8 @@ def add_listitem( self.level_at_new_list + self.prev_indent() + 1, self.level_at_new_list + ilevel + 1, ): - # TODO: determine if this is an unordered list or an ordered list. - # Set GroupLabel.ORDERED_LIST when it fits. + # Determine if this is an unordered list or an ordered list. + # Set GroupLabel.ORDERED_LIST when it fits. self.listIter = 0 if is_numbered: self.parents[i] = doc.add_group( @@ -467,6 +461,19 @@ def get_rowspan(cell): row_span = get_rowspan(cell) col_span = get_colspan(cell) + cell_text = cell.text + # In case cell doesn't return text via docx library: + if len(cell_text) == 0: + cell_xml = cell._element + + texts = [""] + for elem in cell_xml.iter(): + if elem.tag.endswith("t"): # tags that contain text + if elem.text: + texts.append(elem.text) + # Join the collected text + cell_text = " ".join(texts).strip() + # Find the next available column in the grid while table_grid[row_idx][col_idx] is not None: col_idx += 1 @@ -477,15 +484,15 @@ def get_rowspan(cell): table_grid[row_idx + i][col_idx + j] = "" cell = TableCell( - text=cell.text, + text=cell_text, row_span=row_span, col_span=col_span, start_row_offset_idx=row_idx, end_row_offset_idx=row_idx + row_span, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, - col_header=False, # col_header, - row_header=False, # ((not col_header) and html_cell.name=='th') + col_header=False, + row_header=False, ) data.table_cells.append(cell)