From 81c8243a8bf177feed8f87ea283b5bb6836350cb Mon Sep 17 00:00:00 2001
From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com>
Date: Mon, 11 Nov 2024 16:38:21 +0100
Subject: [PATCH] fix: Added handling of grouped elements in pptx backend
 (#307)

* Added handling of grouped elements in pptx backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Replaced deprecated _log.warn call with _log.warning

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/backend/mspowerpoint_backend.py | 39 +++++++++++--------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index cbec761c..b71cd859 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
 
             size = Size(width=slide_width, height=slide_height)
             parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
-            # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
-
-            # Loop through each shape in the slide
-            for shape in slide.shapes:
 
+            def handle_shapes(shape, parent_slide, slide_ind, doc):
+                handle_groups(shape, parent_slide, slide_ind, doc)
                 if shape.has_table:
                     # Handle Tables
                     self.handle_tables(shape, parent_slide, slide_ind, doc)
-
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                    # Handle Tables
+                    # Handle Pictures
                     self.handle_pictures(shape, parent_slide, slide_ind, doc)
-
                 # If shape doesn't have any text, move on to the next shape
                 if not hasattr(shape, "text"):
-                    continue
+                    return
                 if shape.text is None:
-                    continue
+                    return
                 if len(shape.text.strip()) == 0:
-                    continue
+                    return
                 if not shape.has_text_frame:
-                    _log.warn("Warning: shape has text but not text_frame")
-                    continue
-
-                # if shape.is_placeholder:
-                # Handle Titles (Headers) and Subtitles
-                # Check if the shape is a placeholder (titles are placeholders)
-                # self.handle_title(shape, parent_slide, slide_ind, doc)
-                # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
-                # else:
-
+                    _log.warning("Warning: shape has text but not text_frame")
+                    return
                 # Handle other text elements, including lists (bullet lists, numbered lists)
                 self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+                return
+
+            def handle_groups(shape, parent_slide, slide_ind, doc):
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for groupedshape in shape.shapes:
+                        handle_shapes(groupedshape, parent_slide, slide_ind, doc)
 
-                # figures...
-                # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+                handle_shapes(shape, parent_slide, slide_ind, doc)
 
         return doc