Skip to content

Commit

Permalink
fix: Fixes for wordx (#432)
Browse files Browse the repository at this point in the history
* fixes for referencing drawing blip in wordx

Signed-off-by: Maksym Lysak <[email protected]>

* Added a safety try-except around loading a Pillow image from a docx blob. Added an explicit dependency on lxml.

Signed-off-by: Maksym Lysak <[email protected]>

* Added a test for a Word file with embedded EMF images, regenerated the full docx tests, and relaxed the lxml dependency constraint.

Signed-off-by: Maksym Lysak <[email protected]>

* Updated lxml dependency version

Signed-off-by: Maksym Lysak <[email protected]>

---------

Signed-off-by: Maksym Lysak <[email protected]>
Co-authored-by: Maksym Lysak <[email protected]>
  • Loading branch information
maxmnemonic and Maksym Lysak authored Nov 26, 2024
1 parent d7072b4 commit d0a1180
Show file tree
Hide file tree
Showing 13 changed files with 1,296 additions and 153 deletions.
31 changes: 22 additions & 9 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
TableData,
)
from lxml import etree
from PIL import Image
from lxml.etree import XPath
from PIL import Image, UnidentifiedImageError

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
Expand Down Expand Up @@ -132,8 +133,14 @@ def get_level(self) -> int:
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body:
tag_name = etree.QName(element).localname

# Check for Inline Images (blip elements)
drawing_blip = element.xpath(".//a:blip")
namespaces = {
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
"r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
}
xpath_expr = XPath(".//a:blip", namespaces=namespaces)
drawing_blip = xpath_expr(element)

# Check for Tables
if element.tag.endswith("tbl"):
Expand Down Expand Up @@ -210,7 +217,6 @@ def handle_text_elements(self, element, docx_obj, doc):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)

if paragraph.text is None:
# _log.warn(f"paragraph has text==None")
return
text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
Expand Down Expand Up @@ -502,10 +508,17 @@ def get_docx_image(element, drawing_blip):
image_data = get_docx_image(element, drawing_blip)
image_bytes = BytesIO(image_data)
# Open the BytesIO object with PIL to create an Image
pil_image = Image.open(image_bytes)
doc.add_picture(
parent=self.parents[self.level],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
try:
pil_image = Image.open(image_bytes)
doc.add_picture(
parent=self.parents[self.level],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except (UnidentifiedImageError, OSError) as e:
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[self.level],
caption=None,
)
return
247 changes: 103 additions & 144 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
lxml = ">=4.0.0,<6.0.0"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }

[tool.poetry.group.dev.dependencies]
Expand Down
Binary file added tests/data/docx/test_emf_docx.docx
Binary file not shown.
10 changes: 10 additions & 0 deletions tests/data/groundtruth/docling_v2/tablecell.docx.itxt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list
item-2 at level 2: list_item: Hello world1
item-3 at level 2: list_item: Hello2
item-4 at level 1: paragraph:
item-5 at level 1: paragraph: Some text before
item-6 at level 1: table with [3x3]
item-7 at level 1: paragraph:
item-8 at level 1: paragraph:
item-9 at level 1: paragraph: Some text after
Loading

0 comments on commit d0a1180

Please sign in to comment.