From 81c8243a8bf177feed8f87ea283b5bb6836350cb Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:38:21 +0100 Subject: [PATCH] fix: Added handling of grouped elements in pptx backend (#307) * Added handling of grouped elements in pptx backend Signed-off-by: Maksym Lysak * updated log.warn to warning Signed-off-by: Maksym Lysak --------- Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/mspowerpoint_backend.py | 39 +++++++++++-------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index cbec761c..b71cd859 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument: size = Size(width=slide_width, height=slide_height) parent_page = doc.add_page(page_no=slide_ind + 1, size=size) - # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash) - - # Loop through each shape in the slide - for shape in slide.shapes: + def handle_shapes(shape, parent_slide, slide_ind, doc): + handle_groups(shape, parent_slide, slide_ind, doc) if shape.has_table: # Handle Tables self.handle_tables(shape, parent_slide, slide_ind, doc) - if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: - # Handle Tables + # Handle Pictures self.handle_pictures(shape, parent_slide, slide_ind, doc) - # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): - continue + return if shape.text is None: - continue + return if len(shape.text.strip()) == 0: - continue + return if not shape.has_text_frame: - _log.warn("Warning: shape has text but not text_frame") - continue - - # if shape.is_placeholder: - # Handle Titles (Headers) and Subtitles - # Check if the shape is a placeholder (titles are placeholders) - # self.handle_title(shape, parent_slide, slide_ind, doc) - # self.handle_text_elements(shape, parent_slide, slide_ind, doc) - # else: - + _log.warning("Warning: shape has text but not text_frame") + return # Handle other text elements, including lists (bullet lists, numbered lists) self.handle_text_elements(shape, parent_slide, slide_ind, doc) + return + + def handle_groups(shape, parent_slide, slide_ind, doc): + if shape.shape_type == MSO_SHAPE_TYPE.GROUP: + for groupedshape in shape.shapes: + handle_shapes(groupedshape, parent_slide, slide_ind, doc) - # figures... - # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None) + # Loop through each shape in the slide + for shape in slide.shapes: + handle_shapes(shape, parent_slide, slide_ind, doc) return doc