Establish high-level DoclingPdfParser and PdfDocument APIs

Signed-off-by: Christoph Auer <[email protected]>
DS4SD · Jan 15, 2025 · 2950c3f
1 parent 3aea044
commit 2950c3f
Showing 3 changed files with 62 additions and 44 deletions.
diff --git a/docling_parse/document.py b/docling_parse/document.py
@@ -3,7 +3,7 @@
 import logging
 import math
 from enum import Enum
-from typing import Dict, Iterator, List, Optional, Tuple, Union, Annotated, NamedTuple
+from typing import Annotated, Dict, Iterator, List, NamedTuple, Optional, Tuple, Union
 
 from docling_core.types.doc.base import BoundingBox, CoordOrigin
 from PIL import Image as PILImage
@@ -18,6 +18,7 @@
 
 ColorChannelValue = Annotated[int, Field(ge=0, le=255)]
 
+
 class ColorRGBA(BaseModel):
     r: ColorChannelValue
     g: ColorChannelValue
@@ -29,10 +30,13 @@ def as_tuple(self) -> tuple[int, int, int, int]:
 
     def __iter__(self):
         yield from (self.r, self.g, self.b, self.a)
+
+
 class Coord2D(NamedTuple):
     x: float
     y: float
 
+
 class BoundingRectangle(BaseModel):
 
     r_x0: float
@@ -134,9 +138,11 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
                 coord_origin=CoordOrigin.TOPLEFT,
             )
 
+
 class PdfBaseElement(BaseModel):
     ordering: int
 
+
 class PdfCell(PdfBaseElement):
 
     rect: BoundingRectangle
@@ -152,7 +158,7 @@ class PdfCell(PdfBaseElement):
 
     widget: bool
 
-    rgba: ColorRGBA = (0, 0, 0, 255)
+    rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)
 
 
 class PdfBitmapResource(PdfBaseElement):
@@ -163,8 +169,9 @@ class PdfBitmapResource(PdfBaseElement):
 
 class PdfLine(PdfBaseElement):
 
-    #line_parent_id: int
+    # line_parent_id: int
     points: List[Tuple[float, float]]
+    # line_parent_id: int
 
     coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT
 
@@ -519,7 +526,6 @@ class ParsedPage(BaseModel):
 
 class ParsedPdfDocument(BaseModel):
 
-
     pages: Dict[int, ParsedPage] = {}
 
     def iterate_pages(

diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py
@@ -1,21 +1,21 @@
 """Parser for PDF files"""
+
 import hashlib
 from io import BytesIO
 from pathlib import Path
-from typing import List, Tuple, Union, Dict, Iterator
+from typing import Dict, Iterator, List, Tuple, Union
 
 from docling_core.types.doc.base import BoundingBox, CoordOrigin
-from pydantic import BaseModel, TypeAdapter
 
 from docling_parse.document import (
     BoundingRectangle,
     PageBoundaryType,
-    PdfCell,
     PageDimension,
-    PdfBitmapResource,
-    PdfLine,
     ParsedPage,
     ParsedPdfDocument,
+    PdfBitmapResource,
+    PdfCell,
+    PdfLine,
     SegmentedPage,
 )
 from docling_parse.pdf_parsers import pdf_parser_v2  # type: ignore[import]
@@ -29,7 +29,12 @@ def iterate_pages(
         for page_no in range(self.number_of_pages()):
             yield page_no + 1, self.get_page(page_no + 1)
 
-    def __init__(self, parser: "pdf_parser_v2", key: str, boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX):
+    def __init__(
+        self,
+        parser: "pdf_parser_v2",
+        key: str,
+        boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX,
+    ):
         self._parser: pdf_parser_v2 = parser
         self._key = key
         self._boundary_type = boundary_type
@@ -39,9 +44,12 @@ def is_loaded(self) -> bool:
         return self._parser.is_loaded(key=self._key)
 
     def unload(self) -> bool:
+        self._pages.clear()
+
         if self.is_loaded():
             return self._parser.unload_document(self._key)
-        self._pages.clear()
+        else:
+            return False
 
     def number_of_pages(self) -> int:
         if self.is_loaded():
@@ -57,8 +65,10 @@ def get_page(self, page_no: int):
                 doc_dict = self._parser.parse_pdf_from_key_on_page(
                     key=self._key, page=page_no - 1, page_boundary=self._boundary_type
                 )
-                for pi, page in enumerate(doc_dict["pages"]): # only one page is expected
-                    self._pages[page_no] = self._to_parsed_page(page) # put on cache
+                for pi, page in enumerate(
+                    doc_dict["pages"]
+                ):  # only one page is expected
+                    self._pages[page_no] = self._to_parsed_page(page)  # put on cache
                     return self._pages[page_no]
 
             else:
@@ -225,7 +235,11 @@ def _to_lines(self, data: dict) -> List[PdfLine]:
                 for k in range(i0, i1):
                     points.append((item["x"][k], item["y"][k]))
 
-                line = PdfLine(ordering=ind, line_parent_id=l, points=points)
+                line = PdfLine(
+                    ordering=ind,
+                    points=points,
+                    # line_parent_id=l,
+                )
                 result.append(line)
 
         return result
@@ -280,7 +294,6 @@ def set_loglevel(self, loglevel: str):
         """
         self.parser.set_loglevel_with_label(level=loglevel)
 
-
     def list_loaded_keys(self) -> List[str]:
         """List the keys of the loaded documents.
 
@@ -289,16 +302,20 @@ def list_loaded_keys(self) -> List[str]:
         """
         return self.parser.list_loaded_keys()
 
-    def load(self, path_or_stream: Union[str, Path, BytesIO], lazy: bool = True, boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX) -> PdfDocument:
-        #success: bool
-        #key: str
+    def load(
+        self,
+        path_or_stream: Union[str, Path, BytesIO],
+        lazy: bool = True,
+        boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX,
+    ) -> PdfDocument:
+        # success: bool
+        # key: str
 
         if isinstance(path_or_stream, str):
             path_or_stream = Path(path_or_stream)
 
-
         if isinstance(path_or_stream, Path):
-            key = f"key={str(path_or_stream)}" # use filepath as internal handle
+            key = f"key={str(path_or_stream)}"  # use filepath as internal handle
             success = self._load_document(key=key, filename=str(path_or_stream))
 
         elif isinstance(path_or_stream, BytesIO):
@@ -309,19 +326,20 @@ def load(self, path_or_stream: Union[str, Path, BytesIO], lazy: bool = True, bou
             path_or_stream.seek(0)
             hash = hasher.hexdigest()
 
-            key = f"key={hash}" # use md5 hash as internal handle
+            key = f"key={hash}"  # use md5 hash as internal handle
             success = self._load_document_from_bytesio(key=key, data=path_or_stream)
 
         if success:
-            result_doc = PdfDocument(parser=self.parser, key=key, boundary_type=boundary_type)
-            if not lazy: # eagerly parse the pages at init time if desired
+            result_doc = PdfDocument(
+                parser=self.parser, key=key, boundary_type=boundary_type
+            )
+            if not lazy:  # eagerly parse the pages at init time if desired
                 result_doc.load_all_pages()
 
             return result_doc
         else:
             raise RuntimeError(f"Failed to load document with key {key}")
 
-
     def _load_document(self, key: str, filename: str) -> bool:
         """Load a document by key and filename.
 
@@ -345,5 +363,3 @@ def _load_document_from_bytesio(self, key: str, data: BytesIO) -> bool:
              bool: True if the document was successfully loaded, False otherwise.)")
         """
         return self.parser.load_document(key=key, bytes_io=data)
-
-
diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -22,29 +22,30 @@ def test_reference_documents_from_filenames():
     for pdf_doc_path in pdf_docs:
         print(pdf_doc_path)
 
-        pdf_doc: PdfDocument = parser.load(pagth_or_stream=pdf_doc_path,
-                                           boundary_type=PageBoundaryType.CROP_BOX, # default: CROP_BOX
-                                           lazy=False) # default: True
+        pdf_doc: PdfDocument = parser.load(
+            pagth_or_stream=pdf_doc_path,
+            boundary_type=PageBoundaryType.CROP_BOX,  # default: CROP_BOX
+            lazy=False,
+        )  # default: True
         assert pdf_doc is not None
 
         for page_no, page in pdf_doc.iterate_pages():
             print(" -> Page ", page_no, end=" ")
             print("has ", len(page.sanitized.cells), "cells.")
-            #res = page.original.render()
-            #res.show()
+            # res = page.original.render()
+            # res.show()
 
     assert True
 
+
 def test_load_lazy_or_eager():
     filename = "tests/data/regression/table_of_contents_01.pdf"
 
     parser = DoclingPdfParser(loglevel="fatal")
 
-    pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename,
-                                       lazy=True)
+    pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename, lazy=True)
 
-    pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename,
-                                       lazy=False)
+    pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename, lazy=False)
 
     # The lazy doc has no pages populated, the eager one has them.
     assert pdf_doc_case1._pages != pdf_doc_case2._pages
@@ -61,13 +62,9 @@ def test_load_two_distinct_docs():
 
     parser = DoclingPdfParser(loglevel="fatal")
 
-    pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename1,
-                                       lazy=True)
-
-
-    pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename2,
-                                       lazy=True)
+    pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename1, lazy=True)
 
+    pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename2, lazy=True)
 
     assert pdf_doc_case1.number_of_pages() != pdf_doc_case2.number_of_pages()
 
@@ -76,13 +73,13 @@ def test_load_two_distinct_docs():
 
     assert pdf_doc_case1._pages != pdf_doc_case2._pages
 
+
 def test_serialize_and_reload():
     filename = "tests/data/regression/table_of_contents_01.pdf"
 
     parser = DoclingPdfParser(loglevel="fatal")
 
-    pdf_doc: PdfDocument = parser.load(path_or_stream=filename,
-                                             lazy=True)
+    pdf_doc: PdfDocument = parser.load(path_or_stream=filename, lazy=True)
 
     # TODO a proper serialization model must be still established for a full PdfDocument
 
@@ -92,4 +89,3 @@ def test_serialize_and_reload():
     reloaded_pages: Dict[int, ParsedPage] = page_adapter.validate_json(json_pages)
 
     assert reloaded_pages == pdf_doc._pages
-