Skip to content

Commit

Permalink
Refactor and rename high-level APIs (WIP)
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Jan 14, 2025
1 parent 833c528 commit 6cbd931
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 57 deletions.
84 changes: 50 additions & 34 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,35 @@
import logging
import math
from enum import Enum
from typing import Dict, Iterator, List, Optional, Tuple, Union
from typing import Dict, Iterator, List, Optional, Tuple, Union, Annotated, NamedTuple

from docling_core.types.doc.base import BoundingBox, CoordOrigin
from PIL import Image as PILImage
from PIL import ImageColor, ImageDraw, ImageFont
from PIL.ImageFont import FreeTypeFont
from pydantic import AnyUrl, BaseModel
from pydantic import AnyUrl, BaseModel, Field

# Apply default logging configuration when this module is imported: INFO
# level, with each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# An integer color channel validated to lie in the inclusive range 0..255.
ColorChannelValue = Annotated[int, Field(ge=0, le=255)]


class ColorRGBA(BaseModel):
    """Color described by red, green, blue and alpha channels.

    Each channel is a ``ColorChannelValue`` (an ``int`` constrained to
    ``0..255``).  The alpha channel ``a`` defaults to 255 when omitted.
    """

    r: ColorChannelValue
    g: ColorChannelValue
    b: ColorChannelValue
    a: ColorChannelValue = 255

    def as_tuple(self) -> tuple[int, int, int, int]:
        """Return the channels as an ``(r, g, b, a)`` tuple."""
        return (self.r, self.g, self.b, self.a)

    def __iter__(self):
        """Yield the channel values in ``r, g, b, a`` order.

        This lets an instance be unpacked like a 4-tuple.

        NOTE(review): this replaces pydantic's ``BaseModel.__iter__``, which
        yields ``(field_name, value)`` pairs -- confirm nothing relies on
        ``dict(color)`` or similar.
        """
        yield from (self.r, self.g, self.b, self.a)
class Coord2D(NamedTuple):
    """Immutable 2-D point with ``x`` and ``y`` coordinates.

    As a ``NamedTuple`` it unpacks and compares like a plain ``(x, y)``
    tuple while also offering attribute access.
    """

    x: float
    y: float

class BoundingRectangle(BaseModel):

Expand Down Expand Up @@ -59,6 +75,7 @@ def angle(self):
return -3.142592 / 2.0

def to_bounding_box(self) -> BoundingBox:
# FIXME: This code dangerously assumes that (x0, y0) is the bottom-left-most corner and (x2, y2) is the top-right-most corner.
return BoundingBox(
l=self.r_x0,
b=self.r_y0,
Expand Down Expand Up @@ -117,8 +134,10 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
coord_origin=CoordOrigin.TOPLEFT,
)

class PdfBaseElement(BaseModel):
    """Common base for the PDF page element models declared in this module."""

    # Integer ordering key of the element.
    # NOTE(review): the parser code visible alongside this module passes an
    # ``enumerate`` index here -- confirm that is the intended meaning.
    ordering: int

class PageCell(BaseModel):
class PdfCell(PdfBaseElement):

rect: BoundingRectangle

Expand All @@ -133,21 +152,18 @@ class PageCell(BaseModel):

widget: bool

# FIXME: could use something more sophisticated?
rgba: Tuple[int, int, int, int] = (0, 0, 0, 255)
rgba: ColorRGBA = (0, 0, 0, 255)


class PdfBitmapResource(PdfBaseElement):
    """Bitmap image resource placed on a PDF page.

    The ``ordering`` field is inherited from ``PdfBaseElement``.
    """

    # Rectangle associated with the bitmap on the page.
    rect: BoundingRectangle
    # Optional URL for the image; no default is declared, and the parser
    # code visible alongside this module passes ``None`` explicitly.
    uri: Optional[AnyUrl]


class PageLine(BaseModel):
class PdfLine(PdfBaseElement):

ordering: int
line_parent_id: int
#line_parent_id: int
points: List[Tuple[float, float]]

coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT
Expand Down Expand Up @@ -195,14 +211,14 @@ def to_top_left_origin(self, page_height: float):
self.coord_origin = CoordOrigin.TOPLEFT


class PageBoundaryLabel(str, Enum):
class PageBoundaryType(str, Enum):
"""PageBoundaryLabel."""

ART = "art_box"
BLEED = "bleed_box"
CROP = "crop_box"
MEDIA = "media_box"
TRIM = "trim_box"
ART_BOX = "art_box"
BLEED_BOX = "bleed_box"
CROP_BOX = "crop_box"
MEDIA_BOX = "media_box"
TRIM_BOX = "trim_box"

def __str__(self):
"""Get string value."""
Expand All @@ -212,7 +228,7 @@ def __str__(self):
class PageDimension(BaseModel):

angle: float
page_boundary: PageBoundaryLabel
boundary_type: PageBoundaryType

# bbox: BoundingBox
rect: BoundingRectangle
Expand All @@ -226,31 +242,31 @@ class PageDimension(BaseModel):
@property
def width(self):
"""width."""
# FIXME: think about angle, page_boundary and coord_origin ...
# FIXME: think about angle, boundary_type and coord_origin ...
return self.crop_bbox.width

@property
def height(self):
"""height."""

# FIXME: think about angle, page_boundary and coord_origin ...
# FIXME: think about angle, boundary_type and coord_origin ...
return self.crop_bbox.height

@property
def origin(self):
"""height."""

# FIXME: think about angle, page_boundary and coord_origin ...
# FIXME: think about angle, boundary_type and coord_origin ...
return (self.crop_bbox.l, self.crop_bbox.b)


class SegmentedPage(BaseModel):

dimension: PageDimension

cells: List[PageCell]
images: List[PageImage]
lines: List[PageLine]
cells: List[PdfCell]
images: List[PdfBitmapResource]
lines: List[PdfLine]

def crop_text(self, bbox: BoundingBox, eps: float = 1.0):

Expand Down Expand Up @@ -289,7 +305,7 @@ def crop_text(self, bbox: BoundingBox, eps: float = 1.0):

def render(
self,
page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP, # media_box
boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX, # media_box
draw_cells_bbox: bool = False,
draw_cells_text: bool = True,
draw_cells_bl: bool = False,
Expand Down Expand Up @@ -484,15 +500,15 @@ def _draw_text_in_bounding_bbox(
return result


class ParsedPageLabel(str, Enum):
"""ParsedPageLabel."""

ORIGINAL = "orginal"
SANITIZED = "sanitized"

def __str__(self):
"""Get string value."""
return str(self.value)
# class ParsedPageLabel(str, Enum):
# """ParsedPageLabel."""
#
# ORIGINAL = "orginal"
# SANITIZED = "sanitized"
#
# def __str__(self):
# """Get string value."""
# return str(self.value)


class ParsedPage(BaseModel):
Expand All @@ -501,7 +517,7 @@ class ParsedPage(BaseModel):
sanitized: SegmentedPage


class ParsedPaginatedDocument(BaseModel):
class ParsedPdfDocument(BaseModel):

pages: Dict[int, ParsedPage] = {}

Expand Down
44 changes: 23 additions & 21 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@

from docling_parse.document import (
BoundingRectangle,
PageBoundaryLabel,
PageCell,
PageBoundaryType,
PdfCell,
PageDimension,
PageImage,
PageLine,
PdfBitmapResource,
PdfLine,
ParsedPage,
ParsedPaginatedDocument,
ParsedPdfDocument,
SegmentedPage,
)
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]


class pdf_parser:
class DoclingPdfParser:

def __init__(self, loglevel: str = "fatal"):
"""
Expand Down Expand Up @@ -110,8 +110,8 @@ def parse(
self,
key: str,
page_no: int = -1,
page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP,
) -> ParsedPaginatedDocument:
page_boundary: PageBoundaryType = PageBoundaryType.CROP_BOX,
) -> ParsedPdfDocument:
"""
Parse the PDF document identified by its unique key and return a JSON representation.
Expand Down Expand Up @@ -143,7 +143,7 @@ def parse(

def _to_dimension(self, dimension: dict) -> PageDimension:

page_boundary: PageBoundaryLabel = PageBoundaryLabel(dimension["page_boundary"])
boundary_type: PageBoundaryType = PageBoundaryType(dimension["page_boundary"])

"""
bbox = BoundingBox(
Expand Down Expand Up @@ -195,6 +195,8 @@ def _to_dimension(self, dimension: dict) -> PageDimension:
coord_origin=CoordOrigin.BOTTOMLEFT,
)

# FIXME: The boundary type this rect refers to should be selectable via a user argument.
# TODO: Why is this a BoundingRectangle and not a BoundingBox?
rect = BoundingRectangle(
r_x0=crop_bbox.l,
r_y0=crop_bbox.b,
Expand All @@ -209,7 +211,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension:

return PageDimension(
angle=dimension["angle"],
page_boundary=dimension["page_boundary"],
boundary_type=boundary_type,
# bbox=bbox,
rect=rect,
art_bbox=art_bbox,
Expand All @@ -219,15 +221,15 @@ def _to_dimension(self, dimension: dict) -> PageDimension:
bleed_bbox=bleed_bbox,
)

def _to_cells(self, cells: dict) -> List[PageCell]:
def _to_cells(self, cells: dict) -> List[PdfCell]:

assert "data" in cells, '"data" in cells'
assert "header" in cells, '"header" in cells'

data = cells["data"]
header = cells["header"]

result: List[PageCell] = []
result: List[PdfCell] = []
for ind, row in enumerate(data):
rect = BoundingRectangle(
r_x0=row[header.index(f"r_x0")],
Expand All @@ -239,7 +241,7 @@ def _to_cells(self, cells: dict) -> List[PageCell]:
r_x3=row[header.index(f"r_x3")],
r_y3=row[header.index(f"r_y3")],
)
cell = PageCell(
cell = PdfCell(
rect=rect,
text=row[header.index(f"text")],
orig=row[header.index(f"text")],
Expand All @@ -253,15 +255,15 @@ def _to_cells(self, cells: dict) -> List[PageCell]:

return result

def _to_images(self, images: dict) -> List[PageImage]:
def _to_images(self, images: dict) -> List[PdfBitmapResource]:

assert "data" in images, '"data" in images'
assert "header" in images, '"header" in images'

data = images["data"]
header = images["header"]

result: List[PageImage] = []
result: List[PdfBitmapResource] = []
for ind, row in enumerate(data):
rect = BoundingRectangle(
r_x0=row[header.index(f"x0")],
Expand All @@ -273,14 +275,14 @@ def _to_images(self, images: dict) -> List[PageImage]:
r_x3=row[header.index(f"x0")],
r_y3=row[header.index(f"y1")],
)
image = PageImage(ordering=ind, rect=rect, uri=None)
image = PdfBitmapResource(ordering=ind, rect=rect, uri=None)
result.append(image)

return result

def _to_lines(self, data: dict) -> List[PageLine]:
def _to_lines(self, data: dict) -> List[PdfLine]:

result: List[PageLine] = []
result: List[PdfLine] = []
for ind, item in enumerate(data):

for l in range(0, len(item["i"]), 2):
Expand All @@ -291,7 +293,7 @@ def _to_lines(self, data: dict) -> List[PageLine]:
for k in range(i0, i1):
points.append((item["x"][k], item["y"][k]))

line = PageLine(ordering=ind, line_parent_id=l, points=points)
line = PdfLine(ordering=ind, line_parent_id=l, points=points)
result.append(line)

return result
Expand All @@ -314,9 +316,9 @@ def _to_parsed_page(self, page: dict) -> ParsedPage:

def _to_parsed_paginated_document(
self, doc_dict: dict, page_no: int = 1
) -> ParsedPaginatedDocument:
) -> ParsedPdfDocument:

parsed_doc = ParsedPaginatedDocument()
parsed_doc = ParsedPdfDocument()

for pi, page in enumerate(doc_dict["pages"]):
parsed_doc.pages[page_no + pi] = self._to_parsed_page(page)
Expand Down
4 changes: 2 additions & 2 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@

import glob

from docling_parse.pdf_parser import pdf_parser
from docling_parse.pdf_parser import DoclingPdfParser

REGRESSION_FOLDER = "tests/data/regression/*.pdf"


def test_reference_documents_from_filenames_with_keys():

parser = pdf_parser(loglevel="fatal")
parser = DoclingPdfParser(loglevel="fatal")

pdf_docs = glob.glob(REGRESSION_FOLDER)

Expand Down

0 comments on commit 6cbd931

Please sign in to comment.