From d49650c54ffa60bc6d6106970e104071689bc7b0 Mon Sep 17 00:00:00 2001
From: Jinfeng Sun <86536994+Tendo33@users.noreply.github.com>
Date: Tue, 7 Jan 2025 20:58:10 +0800
Subject: [PATCH] fix(mspowerpoint): handle invalid images in PowerPoint slides
 (#650)

- Add error handling for images that cannot be loaded by Pillow
- Improve resilience when encountering corrupted or unsupported image formats
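- Continue processing the remaining slide elements when an image fails to
  load; the failure is logged as a warning instead of raising
- Initialize `doc_label` to `DocItemLabel.LIST_ITEM` in `handle_text_elements`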

Signed-off-by: Tendo33 <sjf1998112@gmail.com>
---
 docling/backend/mspowerpoint_backend.py | 26 ++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index f595e4bd..995969d4 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -16,7 +16,7 @@
     TableCell,
     TableData,
 )
-from PIL import Image
+from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
 
@@ -120,6 +120,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
         bullet_type = "None"
         list_text = ""
         list_label = GroupLabel.LIST
+        doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip())
 
         # Identify if shape contains lists
@@ -276,16 +277,19 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc):
         im_dpi, _ = image.dpi
 
         # Open it with PIL
-        pil_image = Image.open(BytesIO(image_bytes))
-
-        # shape has picture
-        prov = self.generate_prov(shape, slide_ind, "")
-        doc.add_picture(
-            parent=parent_slide,
-            image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
-            caption=None,
-            prov=prov,
-        )
+        try:
+            pil_image = Image.open(BytesIO(image_bytes))
+
+            # shape has picture
+            prov = self.generate_prov(shape, slide_ind, "")
+            doc.add_picture(
+                parent=parent_slide,
+                image=ImageRef.from_pil(image=pil_image, dpi=im_dpi),
+                caption=None,
+                prov=prov,
+            )
+        except (UnidentifiedImageError, OSError) as e:
+            _log.warning(f"Warning: image cannot be loaded by Pillow: {e}")
         return
 
     def handle_tables(self, shape, parent_slide, slide_ind, doc):