Skip to content

Commit

Permalink
fix: Handling of single-cell tables in DOCX backend (#314)
Browse files Browse the repository at this point in the history
* Handling of single-cell tables in DOCX backend

Signed-off-by: Maksym Lysak <[email protected]>

* returned try-catch on tables handling

Signed-off-by: Maksym Lysak <[email protected]>

* cleaned

Signed-off-by: Maksym Lysak <[email protected]>

* proceed processing the content of a single-cell table as if it's just part of the body

Signed-off-by: Maksym Lysak <[email protected]>

* Added example of tricky 1-cell table docx

Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Nov 12, 2024
1 parent 7f5d35e commit fb8ba86
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
16 changes: 8 additions & 8 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,6 @@ def get_level(self) -> int:
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname

# Check for Inline Images (drawings or blip elements)
found_drawing = etree.ElementBase.xpath(
element, ".//w:drawing", namespaces=self.xml_namespaces
Expand Down Expand Up @@ -201,7 +200,6 @@ def get_label_and_level(self, paragraph):
label_str = ""
label_level = 0
if parts[0] == "Heading":
# print("{} - {}".format(parts[0], parts[1]))
label_str = parts[0]
label_level = self.str_to_int(parts[1], default=None)
if parts[1] == "Heading":
Expand All @@ -217,19 +215,16 @@ def handle_text_elements(self, element, docx_obj, doc):
if paragraph.text is None:
# _log.warn(f"paragraph has text==None")
return

text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!

# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# TODO: reliably identify whether list is a numbered list or not
# Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False

p_style_name, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))

if numid == 0:
numid = None
Expand Down Expand Up @@ -450,8 +445,13 @@ def get_rowspan(cell):
for row in table.rows:
# Calculate the max number of columns
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
# if row.cells:
# num_cols = max(num_cols, len(row.cells))

if num_rows == 1 and num_cols == 1:
cell_element = table.rows[0].cells[0]
# In case we have a table of only 1 cell, we consider it furniture
# And proceed processing the content of the cell as though it's in the document body
self.walk_linear(cell_element._element, docx_obj, doc)
return

# Initialize the table grid
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
Expand Down
Binary file added tests/data/docx/tablecell.docx
Binary file not shown.

0 comments on commit fb8ba86

Please sign in to comment.