Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Establish high-level DoclingPdfParser and PdfDocument APIs
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
cau-git committed Jan 15, 2025
1 parent 3aea044 commit 2950c3f
Showing 3 changed files with 62 additions and 44 deletions.
14 changes: 10 additions & 4 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@
import logging
import math
from enum import Enum
from typing import Dict, Iterator, List, Optional, Tuple, Union, Annotated, NamedTuple
from typing import Annotated, Dict, Iterator, List, NamedTuple, Optional, Tuple, Union

from docling_core.types.doc.base import BoundingBox, CoordOrigin
from PIL import Image as PILImage
@@ -18,6 +18,7 @@

ColorChannelValue = Annotated[int, Field(ge=0, le=255)]


class ColorRGBA(BaseModel):
r: ColorChannelValue
g: ColorChannelValue
@@ -29,10 +30,13 @@ def as_tuple(self) -> tuple[int, int, int, int]:

def __iter__(self):
yield from (self.r, self.g, self.b, self.a)


class Coord2D(NamedTuple):
    """A 2D coordinate stored as an immutable ``(x, y)`` pair of floats."""

    # Horizontal component.
    x: float
    # Vertical component.
    y: float


class BoundingRectangle(BaseModel):

r_x0: float
@@ -134,9 +138,11 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
coord_origin=CoordOrigin.TOPLEFT,
)


class PdfBaseElement(BaseModel):
    """Common base model for PDF page elements; carries only an ordering index."""

    # Integer ordering value of the element.
    # NOTE(review): presumably the element's position in parse order — confirm
    # against the code that constructs these elements.
    ordering: int


class PdfCell(PdfBaseElement):

rect: BoundingRectangle
@@ -152,7 +158,7 @@ class PdfCell(PdfBaseElement):

widget: bool

rgba: ColorRGBA = (0, 0, 0, 255)
rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)


class PdfBitmapResource(PdfBaseElement):
@@ -163,8 +169,9 @@ class PdfBitmapResource(PdfBaseElement):

class PdfLine(PdfBaseElement):

#line_parent_id: int
# line_parent_id: int
points: List[Tuple[float, float]]
# line_parent_id: int

coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

@@ -519,7 +526,6 @@ class ParsedPage(BaseModel):

class ParsedPdfDocument(BaseModel):


pages: Dict[int, ParsedPage] = {}

def iterate_pages(
60 changes: 38 additions & 22 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
"""Parser for PDF files"""

import hashlib
from io import BytesIO
from pathlib import Path
from typing import List, Tuple, Union, Dict, Iterator
from typing import Dict, Iterator, List, Tuple, Union

from docling_core.types.doc.base import BoundingBox, CoordOrigin
from pydantic import BaseModel, TypeAdapter

from docling_parse.document import (
BoundingRectangle,
PageBoundaryType,
PdfCell,
PageDimension,
PdfBitmapResource,
PdfLine,
ParsedPage,
ParsedPdfDocument,
PdfBitmapResource,
PdfCell,
PdfLine,
SegmentedPage,
)
from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import]
@@ -29,7 +29,12 @@ def iterate_pages(
for page_no in range(self.number_of_pages()):
yield page_no + 1, self.get_page(page_no + 1)

def __init__(self, parser: "pdf_parser_v2", key: str, boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX):
def __init__(
self,
parser: "pdf_parser_v2",
key: str,
boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX,
):
self._parser: pdf_parser_v2 = parser
self._key = key
self._boundary_type = boundary_type
@@ -39,9 +44,12 @@ def is_loaded(self) -> bool:
return self._parser.is_loaded(key=self._key)

def unload(self) -> bool:
    """Drop the cached pages and unload this document from the parser.

    The page cache is cleared unconditionally and before the parser is
    consulted, so no parsed pages linger regardless of whether the parser
    still holds the document.

    Returns:
        The result of ``self._parser.unload_document(self._key)`` when the
        document is currently loaded, otherwise ``False``.
    """
    self._pages.clear()

    # Nothing to release on the parser side if the document is not loaded.
    if not self.is_loaded():
        return False
    return self._parser.unload_document(self._key)

def number_of_pages(self) -> int:
if self.is_loaded():
@@ -57,8 +65,10 @@ def get_page(self, page_no: int):
doc_dict = self._parser.parse_pdf_from_key_on_page(
key=self._key, page=page_no - 1, page_boundary=self._boundary_type
)
for pi, page in enumerate(doc_dict["pages"]): # only one page is expected
self._pages[page_no] = self._to_parsed_page(page) # put on cache
for pi, page in enumerate(
doc_dict["pages"]
): # only one page is expected
self._pages[page_no] = self._to_parsed_page(page) # put on cache
return self._pages[page_no]

else:
@@ -225,7 +235,11 @@ def _to_lines(self, data: dict) -> List[PdfLine]:
for k in range(i0, i1):
points.append((item["x"][k], item["y"][k]))

line = PdfLine(ordering=ind, line_parent_id=l, points=points)
line = PdfLine(
ordering=ind,
points=points,
# line_parent_id=l,
)
result.append(line)

return result
@@ -280,7 +294,6 @@ def set_loglevel(self, loglevel: str):
"""
self.parser.set_loglevel_with_label(level=loglevel)


def list_loaded_keys(self) -> List[str]:
"""List the keys of the loaded documents.
@@ -289,16 +302,20 @@ def list_loaded_keys(self) -> List[str]:
"""
return self.parser.list_loaded_keys()

def load(self, path_or_stream: Union[str, Path, BytesIO], lazy: bool = True, boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX) -> PdfDocument:
#success: bool
#key: str
def load(
self,
path_or_stream: Union[str, Path, BytesIO],
lazy: bool = True,
boundary_type: PageBoundaryType = PageBoundaryType.CROP_BOX,
) -> PdfDocument:
# success: bool
# key: str

if isinstance(path_or_stream, str):
path_or_stream = Path(path_or_stream)


if isinstance(path_or_stream, Path):
key = f"key={str(path_or_stream)}" # use filepath as internal handle
key = f"key={str(path_or_stream)}" # use filepath as internal handle
success = self._load_document(key=key, filename=str(path_or_stream))

elif isinstance(path_or_stream, BytesIO):
@@ -309,19 +326,20 @@ def load(self, path_or_stream: Union[str, Path, BytesIO], lazy: bool = True, bou
path_or_stream.seek(0)
hash = hasher.hexdigest()

key = f"key={hash}" # use md5 hash as internal handle
key = f"key={hash}" # use md5 hash as internal handle
success = self._load_document_from_bytesio(key=key, data=path_or_stream)

if success:
result_doc = PdfDocument(parser=self.parser, key=key, boundary_type=boundary_type)
if not lazy: # eagerly parse the pages at init time if desired
result_doc = PdfDocument(
parser=self.parser, key=key, boundary_type=boundary_type
)
if not lazy: # eagerly parse the pages at init time if desired
result_doc.load_all_pages()

return result_doc
else:
raise RuntimeError(f"Failed to load document with key {key}")


def _load_document(self, key: str, filename: str) -> bool:
"""Load a document by key and filename.
@@ -345,5 +363,3 @@ def _load_document_from_bytesio(self, key: str, data: BytesIO) -> bool:
bool: True if the document was successfully loaded, False otherwise.
"""
return self.parser.load_document(key=key, bytes_io=data)


32 changes: 14 additions & 18 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
@@ -22,29 +22,30 @@ def test_reference_documents_from_filenames():
for pdf_doc_path in pdf_docs:
print(pdf_doc_path)

pdf_doc: PdfDocument = parser.load(pagth_or_stream=pdf_doc_path,
boundary_type=PageBoundaryType.CROP_BOX, # default: CROP_BOX
lazy=False) # default: True
pdf_doc: PdfDocument = parser.load(
pagth_or_stream=pdf_doc_path,
boundary_type=PageBoundaryType.CROP_BOX, # default: CROP_BOX
lazy=False,
) # default: True
assert pdf_doc is not None

for page_no, page in pdf_doc.iterate_pages():
print(" -> Page ", page_no, end=" ")
print("has ", len(page.sanitized.cells), "cells.")
#res = page.original.render()
#res.show()
# res = page.original.render()
# res.show()

assert True


def test_load_lazy_or_eager():
filename = "tests/data/regression/table_of_contents_01.pdf"

parser = DoclingPdfParser(loglevel="fatal")

pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename,
lazy=True)
pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename, lazy=True)

pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename,
lazy=False)
pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename, lazy=False)

# The lazy doc has no pages populated, the eager one has them.
assert pdf_doc_case1._pages != pdf_doc_case2._pages
@@ -61,13 +62,9 @@ def test_load_two_distinct_docs():

parser = DoclingPdfParser(loglevel="fatal")

pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename1,
lazy=True)


pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename2,
lazy=True)
pdf_doc_case1: PdfDocument = parser.load(path_or_stream=filename1, lazy=True)

pdf_doc_case2: PdfDocument = parser.load(path_or_stream=filename2, lazy=True)

assert pdf_doc_case1.number_of_pages() != pdf_doc_case2.number_of_pages()

@@ -76,13 +73,13 @@ def test_load_two_distinct_docs():

assert pdf_doc_case1._pages != pdf_doc_case2._pages


def test_serialize_and_reload():
filename = "tests/data/regression/table_of_contents_01.pdf"

parser = DoclingPdfParser(loglevel="fatal")

pdf_doc: PdfDocument = parser.load(path_or_stream=filename,
lazy=True)
pdf_doc: PdfDocument = parser.load(path_or_stream=filename, lazy=True)

# TODO: a proper serialization model must still be established for a full PdfDocument

@@ -92,4 +89,3 @@ def test_serialize_and_reload():
reloaded_pages: Dict[int, ParsedPage] = page_adapter.validate_json(json_pages)

assert reloaded_pages == pdf_doc._pages

0 comments on commit 2950c3f

Please sign in to comment.