diff --git a/docling_parse/document.py b/docling_parse/document.py index 37cfd983..cfe5ab9c 100644 --- a/docling_parse/document.py +++ b/docling_parse/document.py @@ -3,19 +3,35 @@ import logging import math from enum import Enum -from typing import Dict, Iterator, List, Optional, Tuple, Union +from typing import Dict, Iterator, List, Optional, Tuple, Union, Annotated, NamedTuple from docling_core.types.doc.base import BoundingBox, CoordOrigin from PIL import Image as PILImage from PIL import ImageColor, ImageDraw, ImageFont from PIL.ImageFont import FreeTypeFont -from pydantic import AnyUrl, BaseModel +from pydantic import AnyUrl, BaseModel, Field # Configure logging logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ) +ColorChannelValue = Annotated[int, Field(ge=0, le=255)] + +class ColorRGBA(BaseModel): + r: ColorChannelValue + g: ColorChannelValue + b: ColorChannelValue + a: ColorChannelValue = 255 + + def as_tuple(self) -> tuple[int, int, int, int]: + return (self.r, self.g, self.b, self.a) + + def __iter__(self): + yield from (self.r, self.g, self.b, self.a) +class Coord2D(NamedTuple): + x: float + y: float class BoundingRectangle(BaseModel): @@ -59,6 +75,7 @@ def angle(self): return -3.142592 / 2.0 def to_bounding_box(self) -> BoundingBox: + # FIXME: This code looks dangerous in assuming x0,y0 is bottom-left most and x2,y2 is top-right most... return BoundingBox( l=self.r_x0, b=self.r_y0, @@ -117,8 +134,10 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle": coord_origin=CoordOrigin.TOPLEFT, ) +class PdfBaseElement(BaseModel): + ordering: int -class PageCell(BaseModel): +class PdfCell(PdfBaseElement): rect: BoundingRectangle @@ -133,21 +152,18 @@ class PageCell(BaseModel): widget: bool - # FIXME: could use something more sofisticated? 
- rgba: Tuple[int, int, int, int] = (0, 0, 0, 255) + rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255) -class PageImage(BaseModel): +class PdfBitmapResource(PdfBaseElement): - ordering: int rect: BoundingRectangle uri: Optional[AnyUrl] -class PageLine(BaseModel): +class PdfLine(PdfBaseElement): - ordering: int - line_parent_id: int + #line_parent_id: int points: List[Tuple[float, float]] coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT @@ -195,14 +211,14 @@ def to_top_left_origin(self, page_height: float): self.coord_origin = CoordOrigin.TOPLEFT -class PageBoundaryLabel(str, Enum): +class PageBoundaryType(str, Enum): """PageBoundaryLabel.""" - ART = "art_box" - BLEED = "bleed_box" - CROP = "crop_box" - MEDIA = "media_box" - TRIM = "trim_box" + ART_BOX = "art_box" + BLEED_BOX = "bleed_box" + CROP_BOX = "crop_box" + MEDIA_BOX = "media_box" + TRIM_BOX = "trim_box" def __str__(self): """Get string value.""" @@ -212,7 +228,7 @@ def __str__(self): class PageDimension(BaseModel): angle: float - page_boundary: PageBoundaryLabel + boundary_type: PageBoundaryType # bbox: BoundingBox rect: BoundingRectangle @@ -226,21 +242,21 @@ class PageDimension(BaseModel): @property def width(self): """width.""" - # FIXME: think about angle, page_boundary and coord_origin ... + # FIXME: think about angle, boundary_type and coord_origin ... return self.crop_bbox.width @property def height(self): """height.""" - # FIXME: think about angle, page_boundary and coord_origin ... + # FIXME: think about angle, boundary_type and coord_origin ... return self.crop_bbox.height @property def origin(self): """height.""" - # FIXME: think about angle, page_boundary and coord_origin ... + # FIXME: think about angle, boundary_type and coord_origin ... 
return (self.crop_bbox.l, self.crop_bbox.b) @@ -248,9 +264,9 @@ class SegmentedPage(BaseModel): dimension: PageDimension - cells: List[PageCell] - images: List[PageImage] - lines: List[PageLine] + cells: List[PdfCell] + images: List[PdfBitmapResource] + lines: List[PdfLine] def crop_text(self, bbox: BoundingBox, eps: float = 1.0): @@ -289,7 +305,7 @@ def crop_text(self, bbox: BoundingBox, eps: float = 1.0): def render( self, - page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP, # media_box + boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX, # media_box draw_cells_bbox: bool = False, draw_cells_text: bool = True, draw_cells_bl: bool = False, @@ -484,15 +500,15 @@ def _draw_text_in_bounding_bbox( return result -class ParsedPageLabel(str, Enum): - """ParsedPageLabel.""" - - ORIGINAL = "orginal" - SANITIZED = "sanitized" - - def __str__(self): - """Get string value.""" - return str(self.value) +# class ParsedPageLabel(str, Enum): +# """ParsedPageLabel.""" +# +# ORIGINAL = "orginal" +# SANITIZED = "sanitized" +# +# def __str__(self): +# """Get string value.""" +# return str(self.value) class ParsedPage(BaseModel): @@ -501,7 +517,7 @@ class ParsedPage(BaseModel): sanitized: SegmentedPage -class ParsedPaginatedDocument(BaseModel): +class ParsedPdfDocument(BaseModel): pages: Dict[int, ParsedPage] = {} diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index be4c22a8..50d478e9 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -7,19 +7,19 @@ from docling_parse.document import ( BoundingRectangle, - PageBoundaryLabel, - PageCell, + PageBoundaryType, + PdfCell, PageDimension, - PageImage, - PageLine, + PdfBitmapResource, + PdfLine, ParsedPage, - ParsedPaginatedDocument, + ParsedPdfDocument, SegmentedPage, ) from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] -class pdf_parser: +class DoclingPdfParser: def __init__(self, loglevel: str = "fatal"): """ @@ -110,8 +110,8 @@ def parse( self, 
key: str, page_no: int = -1, - page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP, - ) -> ParsedPaginatedDocument: + page_boundary: PageBoundaryType = PageBoundaryType.CROP_BOX, + ) -> ParsedPdfDocument: """ Parse the PDF document identified by its unique key and return a JSON representation. @@ -143,7 +143,7 @@ def parse( def _to_dimension(self, dimension: dict) -> PageDimension: - page_boundary: PageBoundaryLabel = PageBoundaryLabel(dimension["page_boundary"]) + boundary_type: PageBoundaryType = PageBoundaryType(dimension["page_boundary"]) """ bbox = BoundingBox( @@ -195,6 +195,8 @@ def _to_dimension(self, dimension: dict) -> PageDimension: coord_origin=CoordOrigin.BOTTOMLEFT, ) + # FIXME: The boundary type this rect refers to should be selectable via a user argument + # TODO: Why is this a BoundingRectangle and not a BoundingBox? rect = BoundingRectangle( r_x0=crop_bbox.l, r_y0=crop_bbox.b, @@ -209,7 +211,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: return PageDimension( angle=dimension["angle"], - page_boundary=dimension["page_boundary"], + boundary_type=boundary_type, # bbox=bbox, rect=rect, art_bbox=art_bbox, @@ -219,7 +221,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: bleed_bbox=bleed_bbox, ) - def _to_cells(self, cells: dict) -> List[PageCell]: + def _to_cells(self, cells: dict) -> List[PdfCell]: assert "data" in cells, '"data" in cells' assert "header" in cells, '"header" in cells' @@ -227,7 +229,7 @@ def _to_cells(self, cells: dict) -> List[PageCell]: data = cells["data"] header = cells["header"] - result: List[PageCell] = [] + result: List[PdfCell] = [] for ind, row in enumerate(data): rect = BoundingRectangle( r_x0=row[header.index(f"r_x0")], @@ -239,7 +241,7 @@ def _to_cells(self, cells: dict) -> List[PageCell]: r_x3=row[header.index(f"r_x3")], r_y3=row[header.index(f"r_y3")], ) - cell = PageCell( + cell = PdfCell( rect=rect, text=row[header.index(f"text")], orig=row[header.index(f"text")], @@ -253,7 +255,7 @@ def 
_to_cells(self, cells: dict) -> List[PageCell]: return result - def _to_images(self, images: dict) -> List[PageImage]: + def _to_images(self, images: dict) -> List[PdfBitmapResource]: assert "data" in images, '"data" in images' assert "header" in images, '"header" in images' @@ -261,7 +263,7 @@ def _to_images(self, images: dict) -> List[PageImage]: data = images["data"] header = images["header"] - result: List[PageImage] = [] + result: List[PdfBitmapResource] = [] for ind, row in enumerate(data): rect = BoundingRectangle( r_x0=row[header.index(f"x0")], @@ -273,14 +275,14 @@ def _to_images(self, images: dict) -> List[PageImage]: r_x3=row[header.index(f"x0")], r_y3=row[header.index(f"y1")], ) - image = PageImage(ordering=ind, rect=rect, uri=None) + image = PdfBitmapResource(ordering=ind, rect=rect, uri=None) result.append(image) return result - def _to_lines(self, data: dict) -> List[PageLine]: + def _to_lines(self, data: dict) -> List[PdfLine]: - result: List[PageLine] = [] + result: List[PdfLine] = [] for ind, item in enumerate(data): for l in range(0, len(item["i"]), 2): @@ -291,7 +293,7 @@ def _to_lines(self, data: dict) -> List[PageLine]: for k in range(i0, i1): points.append((item["x"][k], item["y"][k])) - line = PageLine(ordering=ind, line_parent_id=l, points=points) + line = PdfLine(ordering=ind, points=points) result.append(line) return result @@ -314,9 +316,9 @@ def _to_parsed_page(self, page: dict) -> ParsedPage: def _to_parsed_paginated_document( self, doc_dict: dict, page_no: int = 1 - ) -> ParsedPaginatedDocument: + ) -> ParsedPdfDocument: - parsed_doc = ParsedPaginatedDocument() + parsed_doc = ParsedPdfDocument() for pi, page in enumerate(doc_dict["pages"]): parsed_doc.pages[page_no + pi] = self._to_parsed_page(page) diff --git a/tests/test_parse.py b/tests/test_parse.py index b9e73184..a0515470 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -2,14 +2,14 @@ import glob -from docling_parse.pdf_parser import pdf_parser 
+from docling_parse.pdf_parser import DoclingPdfParser REGRESSION_FOLDER = "tests/data/regression/*.pdf" def test_reference_documents_from_filenames_with_keys(): - parser = pdf_parser(loglevel="fatal") + parser = DoclingPdfParser(loglevel="fatal") pdf_docs = glob.glob(REGRESSION_FOLDER)