Refactor and rename high-level APIs (WIP)

Signed-off-by: Christoph Auer <[email protected]>
DS4SD · Jan 14, 2025 · 6cbd931 · 6cbd931
1 parent 833c528
commit 6cbd931
Show file tree

Hide file tree

Showing 3 changed files with 75 additions and 57 deletions.
diff --git a/docling_parse/document.py b/docling_parse/document.py
@@ -3,19 +3,35 @@
 import logging
 import math
 from enum import Enum
-from typing import Dict, Iterator, List, Optional, Tuple, Union
+from typing import Dict, Iterator, List, Optional, Tuple, Union, Annotated, NamedTuple
 
 from docling_core.types.doc.base import BoundingBox, CoordOrigin
 from PIL import Image as PILImage
 from PIL import ImageColor, ImageDraw, ImageFont
 from PIL.ImageFont import FreeTypeFont
-from pydantic import AnyUrl, BaseModel
+from pydantic import AnyUrl, BaseModel, Field
 
 # Configure logging
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 
+ColorChannelValue = Annotated[int, Field(ge=0, le=255)]
+
+class ColorRGBA(BaseModel):
+    r: ColorChannelValue
+    g: ColorChannelValue
+    b: ColorChannelValue
+    a: ColorChannelValue = 255
+
+    def as_tuple(self) -> tuple[int, int, int, int]:
+        return (self.r, self.g, self.b, self.a)
+
+    def __iter__(self):
+        yield from (self.r, self.g, self.b, self.a)
+class Coord2D(NamedTuple):
+    x: float
+    y: float
 
 class BoundingRectangle(BaseModel):
 
@@ -59,6 +75,7 @@ def angle(self):
             return -3.142592 / 2.0
 
     def to_bounding_box(self) -> BoundingBox:
+        # FIXME: This dangerously assumes (x0, y0) is the bottom-left corner and (x2, y2) is the top-right corner.
         return BoundingBox(
             l=self.r_x0,
             b=self.r_y0,
@@ -117,8 +134,10 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
                 coord_origin=CoordOrigin.TOPLEFT,
             )
 
+class PdfBaseElement(BaseModel):
+    ordering: int
 
-class PageCell(BaseModel):
+class PdfCell(PdfBaseElement):
 
     rect: BoundingRectangle
 
@@ -133,21 +152,18 @@ class PageCell(BaseModel):
 
     widget: bool
 
-    # FIXME: could use something more sofisticated?
-    rgba: Tuple[int, int, int, int] = (0, 0, 0, 255)
+    rgba: ColorRGBA = (0, 0, 0, 255)
 
 
-class PageImage(BaseModel):
+class PdfBitmapResource(PdfBaseElement):
 
-    ordering: int
     rect: BoundingRectangle
     uri: Optional[AnyUrl]
 
 
-class PageLine(BaseModel):
+class PdfLine(PdfBaseElement):
 
-    ordering: int
-    line_parent_id: int
+    #line_parent_id: int
     points: List[Tuple[float, float]]
 
     coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT
@@ -195,14 +211,14 @@ def to_top_left_origin(self, page_height: float):
             self.coord_origin = CoordOrigin.TOPLEFT
 
 
-class PageBoundaryLabel(str, Enum):
+class PageBoundaryType(str, Enum):
     """PageBoundaryLabel."""
 
-    ART = "art_box"
-    BLEED = "bleed_box"
-    CROP = "crop_box"
-    MEDIA = "media_box"
-    TRIM = "trim_box"
+    ART_BOX = "art_box"
+    BLEED_BOX = "bleed_box"
+    CROP_BOX = "crop_box"
+    MEDIA_BOX = "media_box"
+    TRIM_BOX = "trim_box"
 
     def __str__(self):
         """Get string value."""
@@ -212,7 +228,7 @@ def __str__(self):
 class PageDimension(BaseModel):
 
     angle: float
-    page_boundary: PageBoundaryLabel
+    boundary_type: PageBoundaryType
 
     # bbox: BoundingBox
     rect: BoundingRectangle
@@ -226,31 +242,31 @@ class PageDimension(BaseModel):
     @property
     def width(self):
         """width."""
-        # FIXME: think about angle, page_boundary and coord_origin ...
+        # FIXME: think about angle, boundary_type and coord_origin ...
         return self.crop_bbox.width
 
     @property
     def height(self):
         """height."""
 
-        # FIXME: think about angle, page_boundary and coord_origin ...
+        # FIXME: think about angle, boundary_type and coord_origin ...
         return self.crop_bbox.height
 
     @property
     def origin(self):
         """height."""
 
-        # FIXME: think about angle, page_boundary and coord_origin ...
+        # FIXME: think about angle, boundary_type and coord_origin ...
         return (self.crop_bbox.l, self.crop_bbox.b)
 
 
 class SegmentedPage(BaseModel):
 
     dimension: PageDimension
 
-    cells: List[PageCell]
-    images: List[PageImage]
-    lines: List[PageLine]
+    cells: List[PdfCell]
+    images: List[PdfBitmapResource]
+    lines: List[PdfLine]
 
     def crop_text(self, bbox: BoundingBox, eps: float = 1.0):
 
@@ -289,7 +305,7 @@ def crop_text(self, bbox: BoundingBox, eps: float = 1.0):
 
     def render(
         self,
-        page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP,  # media_box
+        boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX,  # media_box
         draw_cells_bbox: bool = False,
         draw_cells_text: bool = True,
         draw_cells_bl: bool = False,
@@ -484,15 +500,15 @@ def _draw_text_in_bounding_bbox(
         return result
 
 
-class ParsedPageLabel(str, Enum):
-    """ParsedPageLabel."""
-
-    ORIGINAL = "orginal"
-    SANITIZED = "sanitized"
-
-    def __str__(self):
-        """Get string value."""
-        return str(self.value)
+# class ParsedPageLabel(str, Enum):
+#     """ParsedPageLabel."""
+#
+#     ORIGINAL = "orginal"
+#     SANITIZED = "sanitized"
+#
+#     def __str__(self):
+#         """Get string value."""
+#         return str(self.value)
 
 
 class ParsedPage(BaseModel):
@@ -501,7 +517,7 @@ class ParsedPage(BaseModel):
     sanitized: SegmentedPage
 
 
-class ParsedPaginatedDocument(BaseModel):
+class ParsedPdfDocument(BaseModel):
 
     pages: Dict[int, ParsedPage] = {}
 

diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py
@@ -7,19 +7,19 @@
 
 from docling_parse.document import (
     BoundingRectangle,
-    PageBoundaryLabel,
-    PageCell,
+    PageBoundaryType,
+    PdfCell,
     PageDimension,
-    PageImage,
-    PageLine,
+    PdfBitmapResource,
+    PdfLine,
     ParsedPage,
-    ParsedPaginatedDocument,
+    ParsedPdfDocument,
     SegmentedPage,
 )
 from docling_parse.pdf_parsers import pdf_parser_v2  # type: ignore[import]
 
 
-class pdf_parser:
+class DoclingPdfParser:
 
     def __init__(self, loglevel: str = "fatal"):
         """
@@ -110,8 +110,8 @@ def parse(
         self,
         key: str,
         page_no: int = -1,
-        page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP,
-    ) -> ParsedPaginatedDocument:
+        page_boundary: PageBoundaryType = PageBoundaryType.CROP_BOX,
+    ) -> ParsedPdfDocument:
         """
         Parse the PDF document identified by its unique key and return a JSON representation.
 
@@ -143,7 +143,7 @@ def parse(
 
     def _to_dimension(self, dimension: dict) -> PageDimension:
 
-        page_boundary: PageBoundaryLabel = PageBoundaryLabel(dimension["page_boundary"])
+        boundary_type: PageBoundaryType = PageBoundaryType(dimension["page_boundary"])
 
         """
         bbox = BoundingBox(
@@ -195,6 +195,8 @@ def _to_dimension(self, dimension: dict) -> PageDimension:
             coord_origin=CoordOrigin.BOTTOMLEFT,
         )
 
+        # FIXME: The boundary type this rect refers to should be selectable via a user argument.
+        # TODO: Why is this a BoundingRectangle and not a BoundingBox?
         rect = BoundingRectangle(
             r_x0=crop_bbox.l,
             r_y0=crop_bbox.b,
@@ -209,7 +211,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension:
 
         return PageDimension(
             angle=dimension["angle"],
-            page_boundary=dimension["page_boundary"],
+            boundary_type=boundary_type,
             # bbox=bbox,
             rect=rect,
             art_bbox=art_bbox,
@@ -219,15 +221,15 @@ def _to_dimension(self, dimension: dict) -> PageDimension:
             bleed_bbox=bleed_bbox,
         )
 
-    def _to_cells(self, cells: dict) -> List[PageCell]:
+    def _to_cells(self, cells: dict) -> List[PdfCell]:
 
         assert "data" in cells, '"data" in cells'
         assert "header" in cells, '"header" in cells'
 
         data = cells["data"]
         header = cells["header"]
 
-        result: List[PageCell] = []
+        result: List[PdfCell] = []
         for ind, row in enumerate(data):
             rect = BoundingRectangle(
                 r_x0=row[header.index(f"r_x0")],
@@ -239,7 +241,7 @@ def _to_cells(self, cells: dict) -> List[PageCell]:
                 r_x3=row[header.index(f"r_x3")],
                 r_y3=row[header.index(f"r_y3")],
             )
-            cell = PageCell(
+            cell = PdfCell(
                 rect=rect,
                 text=row[header.index(f"text")],
                 orig=row[header.index(f"text")],
@@ -253,15 +255,15 @@ def _to_cells(self, cells: dict) -> List[PageCell]:
 
         return result
 
-    def _to_images(self, images: dict) -> List[PageImage]:
+    def _to_images(self, images: dict) -> List[PdfBitmapResource]:
 
         assert "data" in images, '"data" in images'
         assert "header" in images, '"header" in images'
 
         data = images["data"]
         header = images["header"]
 
-        result: List[PageImage] = []
+        result: List[PdfBitmapResource] = []
         for ind, row in enumerate(data):
             rect = BoundingRectangle(
                 r_x0=row[header.index(f"x0")],
@@ -273,14 +275,14 @@ def _to_images(self, images: dict) -> List[PageImage]:
                 r_x3=row[header.index(f"x0")],
                 r_y3=row[header.index(f"y1")],
             )
-            image = PageImage(ordering=ind, rect=rect, uri=None)
+            image = PdfBitmapResource(ordering=ind, rect=rect, uri=None)
             result.append(image)
 
         return result
 
-    def _to_lines(self, data: dict) -> List[PageLine]:
+    def _to_lines(self, data: dict) -> List[PdfLine]:
 
-        result: List[PageLine] = []
+        result: List[PdfLine] = []
         for ind, item in enumerate(data):
 
             for l in range(0, len(item["i"]), 2):
@@ -291,7 +293,7 @@ def _to_lines(self, data: dict) -> List[PageLine]:
                 for k in range(i0, i1):
                     points.append((item["x"][k], item["y"][k]))
 
-                line = PageLine(ordering=ind, line_parent_id=l, points=points)
+                line = PdfLine(ordering=ind, line_parent_id=l, points=points)
                 result.append(line)
 
         return result
@@ -314,9 +316,9 @@ def _to_parsed_page(self, page: dict) -> ParsedPage:
 
     def _to_parsed_paginated_document(
         self, doc_dict: dict, page_no: int = 1
-    ) -> ParsedPaginatedDocument:
+    ) -> ParsedPdfDocument:
 
-        parsed_doc = ParsedPaginatedDocument()
+        parsed_doc = ParsedPdfDocument()
 
         for pi, page in enumerate(doc_dict["pages"]):
             parsed_doc.pages[page_no + pi] = self._to_parsed_page(page)

diff --git a/tests/test_parse.py b/tests/test_parse.py
@@ -2,14 +2,14 @@
 
 import glob
 
-from docling_parse.pdf_parser import pdf_parser
+from docling_parse.pdf_parser import DoclingPdfParser
 
 REGRESSION_FOLDER = "tests/data/regression/*.pdf"
 
 
 def test_reference_documents_from_filenames_with_keys():
 
-    parser = pdf_parser(loglevel="fatal")
+    parser = DoclingPdfParser(loglevel="fatal")
 
     pdf_docs = glob.glob(REGRESSION_FOLDER)