Merge branch 'main' into rtdl/export_latex_docx

Signed-off-by: Rafael Teixeira de Lima <[email protected]>
DS4SD · Jan 27, 2025 · 30d0afe · 30d0afe
2 parents c32c3b6 + 8a4ec77
commit 30d0afe
Show file tree

Hide file tree

Showing 65 changed files with 3,380 additions and 301 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,25 @@
+## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24
+
+### Feature
+
+* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec))
+* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc))
+* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b))
+* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02))
+
+### Fix
+
+* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed))
+* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2))
+
+### Documentation
+
+* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f))
+* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5))
+* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756))
+* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5))
+* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3))
+
 ## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10
 
 ### Fix

diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
@@ -27,7 +27,6 @@ def is_valid(self) -> bool:
     def supports_pagination(cls) -> bool:
         pass
 
-    @abstractmethod
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
             self.path_or_stream.close()

diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py
@@ -24,7 +24,6 @@
 
 
 class AsciiDocBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
@@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], label=label, text=text)
 
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""

diff --git a/docling/backend/json/__init__.py b/docling/backend/json/__init__.py
diff --git a/docling/backend/json/docling_json_backend.py b/docling/backend/json/docling_json_backend.py
@@ -0,0 +1,58 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Union
+
+from docling_core.types.doc import DoclingDocument
+from typing_extensions import override
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+
+class DoclingJSONBackend(DeclarativeDocumentBackend):
+    @override
+    def __init__(
+        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
+    ) -> None:
+        super().__init__(in_doc, path_or_stream)
+
+        # Parse eagerly and keep either the parsed document or the exception that
+        # parsing raised (never both), so that is_valid() can report the outcome
+        # and convert() can return the document or re-raise the original error:
+        self._doc_or_err = self._get_doc_or_err()
+
+    @override
+    def is_valid(self) -> bool:
+        return isinstance(self._doc_or_err, DoclingDocument)
+
+    @classmethod
+    @override
+    def supports_pagination(cls) -> bool:
+        return False
+
+    @classmethod
+    @override
+    def supported_formats(cls) -> set[InputFormat]:
+        return {InputFormat.JSON_DOCLING}
+
+    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
+        try:
+            json_data: Union[str, bytes]
+            if isinstance(self.path_or_stream, Path):
+                with open(self.path_or_stream, encoding="utf-8") as f:
+                    json_data = f.read()
+            elif isinstance(self.path_or_stream, BytesIO):
+                json_data = self.path_or_stream.getvalue()
+            else:
+                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
+            return DoclingDocument.model_validate_json(json_data=json_data)
+        except Exception as e:
+            return e
+
+    @override
+    def convert(self) -> DoclingDocument:
+        if isinstance(self._doc_or_err, DoclingDocument):
+            return self._doc_or_err
+        else:
+            raise self._doc_or_err
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
@@ -3,19 +3,22 @@
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
 
 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
 
@@ -27,8 +30,7 @@
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
 
@@ -90,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
             ) from e
         return
 
-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -137,15 +139,19 @@ def close_table(self, doc=None):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer
             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            table_data = TableData(
+                num_rows=num_rows, num_cols=num_cols, table_cells=tcells
+            )
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return
 
-    def process_inline_text(self, parent_element, doc=None):
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
         # self.inline_text_buffer += str(text_in)
         txt = self.inline_text_buffer.strip()
         if len(txt) > 0:
@@ -156,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None):
             )
         self.inline_text_buffer = ""
 
-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -172,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
 
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []
 
             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -209,9 +221,13 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
 
-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            if (
+                parent_element is not None
+                and isinstance(parent_element, DocItem)
+                and parent_element.label == GroupLabel.ORDERED_LIST
+            ):
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -221,7 +237,14 @@ def traverse(node):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+
+            doc.add_picture(parent=parent_element, caption=fig_caption)
 
         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -252,27 +275,21 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.inline.LineBreak):
             self.process_inline_text(parent_element, doc)

diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py
@@ -26,6 +26,7 @@
 
 from typing import Any, List
 
+from PIL import Image as PILImage
 from pydantic import BaseModel
 
 
@@ -44,7 +45,6 @@ class ExcelTable(BaseModel):
 
 
 class MsExcelDocumentBackend(DeclarativeDocumentBackend):
-
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
@@ -326,49 +326,61 @@ def _find_images_in_sheet(
         self, doc: DoclingDocument, sheet: Worksheet
     ) -> DoclingDocument:
 
-        # FIXME: mypy does not agree with _images ...
-        """
-        # Iterate over images in the sheet
-        for idx, image in enumerate(sheet._images):  # Access embedded images
+        # Iterate over byte images in the sheet
+        for idx, image in enumerate(sheet._images):  # type: ignore
 
-            image_bytes = BytesIO(image.ref.blob)
-            pil_image = Image.open(image_bytes)
+            try:
+                pil_image = PILImage.open(image.ref)
 
-            doc.add_picture(
-                parent=self.parents[0],
-                image=ImageRef.from_pil(image=pil_image, dpi=72),
-                caption=None,
-            )
-        """
+                doc.add_picture(
+                    parent=self.parents[0],
+                    image=ImageRef.from_pil(image=pil_image, dpi=72),
+                    caption=None,
+                )
+            except:
+                _log.error("could not extract the image from excel sheets")
 
-        # FIXME: mypy does not agree with _charts ...
         """
-        for idx, chart in enumerate(sheet._charts):  # Access embedded charts
-            chart_path = f"chart_{idx + 1}.png"
-            _log.info(
-                f"Chart found, but dynamic rendering is required for: {chart_path}"
-            )
+        for idx, chart in enumerate(sheet._charts):  # type: ignore
+            try:
+                chart_path = f"chart_{idx + 1}.png"
+                _log.info(
+                    f"Chart found, but dynamic rendering is required for: {chart_path}"
+                )
 
-            _log.info(f"Chart {idx + 1}:")
-        
-            # Chart type
-            _log.info(f"Type: {type(chart).__name__}")
-            
-            # Title
-            if chart.title:
-                _log.info(f"Title: {chart.title}")
-            else:
-                _log.info("No title")
-            
-            # Data series
-            for series in chart.series:
-                _log.info(" => series ...")
-                _log.info(f"Data Series: {series.title}")
-                _log.info(f"Values: {series.values}")
-                _log.info(f"Categories: {series.categories}")
+                _log.info(f"Chart {idx + 1}:")
                 
-            # Position
-            # _log.info(f"Anchor Cell: {chart.anchor}")
+                # Chart type
+                # _log.info(f"Type: {type(chart).__name__}")
+                print(f"Type: {type(chart).__name__}")
+
+                # Extract series data
+                for series_idx, series in enumerate(chart.series):
+                    #_log.info(f"Series {series_idx + 1}:")
+                    print(f"Series {series_idx + 1} type: {type(series).__name__}")
+                    #print(f"x-values: {series.xVal}")
+                    #print(f"y-values: {series.yVal}")
+
+                    print(f"xval type: {type(series.xVal).__name__}")
+                    
+                    xvals = []
+                    for _ in series.xVal.numLit.pt:
+                        print(f"xval type: {type(_).__name__}")
+                        if hasattr(_, 'v'):
+                            xvals.append(_.v)
+
+                    print(f"x-values: {xvals}")
+                            
+                    yvals = []
+                    for _ in series.yVal:
+                        if hasattr(_, 'v'):
+                            yvals.append(_.v)
+                            
+                    print(f"y-values: {yvals}")                    
+                    
+            except Exception as exc:
+                print(exc)
+                continue
         """
 
         return doc
Original file line number	Diff line number	Diff line change
Expand Up		@@ -24,7 +24,6 @@


		class AsciiDocBackend(DeclarativeDocumentBackend):

		def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
		super().__init__(in_doc, path_or_stream)

Expand Down