From b8f5e38a8c8b3fd734fa119cae216a3da0b363f7 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:22:36 +0200 Subject: [PATCH] feat: introducing docling_backend (#26) Uses our own docling_parse to reliably get PDF cells To get page images, this backend uses pypdfium2 Signed-off-by: Maxim Lysak Co-authored-by: Maxim Lysak --- docling/backend/docling_parse_backend.py | 171 +++++++++++++++++++++++ examples/convert.py | 10 +- poetry.lock | 27 +++- pyproject.toml | 1 + 4 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 docling/backend/docling_parse_backend.py diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py new file mode 100644 index 00000000..1e4bc638 --- /dev/null +++ b/docling/backend/docling_parse_backend.py @@ -0,0 +1,171 @@ +import random +from io import BytesIO +from pathlib import Path +from typing import Iterable, List, Optional, Union + +import pypdfium2 as pdfium +from docling_parse.docling_parse import pdf_parser +from PIL import Image, ImageDraw +from pypdfium2 import PdfPage + +from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend +from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize + + +class DoclingParsePageBackend(PdfPageBackend): + def __init__(self, page_obj: PdfPage, docling_page_obj): + super().__init__(page_obj) + self._ppage = page_obj + self._dpage = docling_page_obj + self.text_page = None + + def get_text_in_rect(self, bbox: BoundingBox) -> str: + # Find intersecting cells on the page + text_piece = "" + page_size = self.get_size() + parser_width = self._dpage["width"] + parser_height = self._dpage["height"] + + scale = ( + 1 # FIX - Replace with param in get_text_in_rect across backends (optional) + ) + + for i in range(len(self._dpage["cells"])): + rect = self._dpage["cells"][i]["box"]["device"] + x0, y0, x1, y1 = rect + cell_bbox = BoundingBox( + l=x0 * scale * page_size.width / parser_width, + b=y0 * scale * page_size.height / parser_height, + r=x1 * scale * page_size.width / parser_width, + t=y1 * scale * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_size.height * scale) + + overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area() + + if overlap_frac > 0.5: + if len(text_piece) > 0: + text_piece += " " + text_piece += self._dpage["cells"][i]["content"]["rnormalized"] + + return text_piece + + def get_text_cells(self) -> Iterable[Cell]: + cells = [] + cell_counter = 0 + + page_size = self.get_size() + + parser_width = self._dpage["width"] + parser_height = self._dpage["height"] + + for i in range(len(self._dpage["cells"])): + rect = self._dpage["cells"][i]["box"]["device"] + x0, y0, x1, y1 = rect + text_piece = self._dpage["cells"][i]["content"]["rnormalized"] + cells.append( + Cell( + id=cell_counter, + text=text_piece, + bbox=BoundingBox( + # l=x0, b=y0, r=x1, t=y1, + l=x0 * page_size.width / parser_width, + b=y0 * page_size.height / parser_height, + r=x1 * page_size.width / parser_width, + t=y1 * page_size.height / parser_height, + coord_origin=CoordOrigin.BOTTOMLEFT, + ).to_top_left_origin(page_size.height), + ) + ) + cell_counter += 1 + + def draw_clusters_and_cells(): + image = self.get_page_image() + draw = ImageDraw.Draw(image) + for c in cells: + x0, y0, x1, y1 = c.bbox.as_tuple() + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) + image.show() + + # before merge: + # draw_clusters_and_cells() + + # cells = merge_horizontal_cells(cells) + + # after merge: + # draw_clusters_and_cells() + + return cells + + def get_page_image( + self, scale: int = 1, cropbox: Optional[BoundingBox] = None + ) -> Image.Image: + + page_size = self.get_size() + + if not cropbox: + cropbox = BoundingBox( + l=0, + r=page_size.width, + t=0, + b=page_size.height, + coord_origin=CoordOrigin.TOPLEFT, + ) + padbox = BoundingBox( + l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT + ) + else: + padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox.r = page_size.width - padbox.r + padbox.t = page_size.height - padbox.t + + image = ( + self._ppage.render( + scale=scale * 1.5, + rotation=0, # no additional rotation + crop=padbox.as_tuple(), + ) + .to_pil() + .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale))) + ) # We resize the image from 1.5x the given scale to make it sharper. + + return image + + def get_size(self) -> PageSize: + return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height()) + + def unload(self): + self._ppage = None + self._dpage = None + self.text_page = None + + +class DoclingParseDocumentBackend(PdfDocumentBackend): + def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]): + super().__init__(path_or_stream) + self._pdoc = pdfium.PdfDocument(path_or_stream) + # Parsing cells with docling_parser call + print("PARSING WITH DOCLING PARSE") + parser = pdf_parser() + self._parser_doc = parser.find_cells(str(path_or_stream)) + + def page_count(self) -> int: + return len(self._parser_doc["pages"]) + + def load_page(self, page_no: int) -> PdfPage: + return DoclingParsePageBackend( + self._pdoc[page_no], self._parser_doc["pages"][page_no] + ) + + def is_valid(self) -> bool: + return self.page_count() > 0 + + def unload(self): + self._pdoc.close() + self._pdoc = None + self._parser_doc = None diff --git a/examples/convert.py b/examples/convert.py index f197c20a..be216406 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -4,7 +4,8 @@ from pathlib import Path from typing import Iterable -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.document_converter import DocumentConverter @@ -54,11 +55,12 @@ def main(): artifacts_path = DocumentConverter.download_models_hf() pipeline_options = PipelineOptions(do_table_structure=True) - # use text cells predicted from table structure model, instead of matching with pdf cells - pipeline_options.table_structure_options.do_cell_matching = False + pipeline_options.table_structure_options.do_cell_matching = True doc_converter = DocumentConverter( - artifacts_path=artifacts_path, pipeline_options=pipeline_options + artifacts_path=artifacts_path, + pipeline_options=pipeline_options, + pdf_backend=DoclingParseDocumentBackend, ) input = DocumentConversionInput.from_paths(input_doc_paths) diff --git a/poetry.lock b/poetry.lock index 4917d3b7..b2b36c7d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -759,6 +759,30 @@ torch = "2.2.2" torchvision = "0.17.2" tqdm = ">=4.64.0,<5.0.0" +[[package]] +name = "docling-parse" +version = "0.0.1" +description = "Simple package to extract text with coordinates from programmatic PDFs" +optional = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:d6301dde11157f94b6436bb87186b4723cce7b1e59e0f74b0a7333339d6f911d"}, + {file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:ac5fb3b6ac568159930103521f2e7002b78c37f6555f23d767b2e247ddbce740"}, + {file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ec9066ad9e7f11a18aa230f67b733d64433185be1da8e887ac273c9683e02938"}, + {file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:3e5d560ac3527a9bda5bf01905ec6a5fb9eb889a5bec2c3c909cf9c75642e2d3"}, + {file = "docling_parse-0.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d56de1a5b45b19117d4fe1f444878501796ec5f17de880c06c1ce3184ac360e7"}, + {file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:110a08f4663ee18833b2b89013993c2326b519a7fe21a64940d9f2789f52be29"}, + {file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:19cf275ce78d2ebb7c3e577b5126f1f2af6fd28557b63c42d1455f1cc87be454"}, + {file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1fdd07ac20951935e3f74b1ec1f503c4493440664aaa8e30ab7fa6334c2a4937"}, + {file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:d8018263eba239c702f79149ed16ec4e749bdec5396aea9e78b9cdfbae1b86bd"}, + {file = "docling_parse-0.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:299281bfc14ca95cc1db677f48f152105be0f96beab171313004cdb7ce448df4"}, + {file = "docling_parse-0.0.1-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:b05d40d6570212ca1e3b98fb55ce1c861d28484db2bde513b6c5e8b3339f4021"}, + {file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:11bdddc8f767bdd14b317bcb25d7fc46b656f867f137a5d8fe6d0f95d61d2ce9"}, + {file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:14a52b46c887c00b0a1da0f5ea4e6652ab9e23deeac43f6d98b239a6cba7fbf1"}, + {file = "docling_parse-0.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17caa551f7432555823f01a4882e869068198a8b27eec1449afc6c821b594330"}, + {file = "docling_parse-0.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27aac51dd7753fac57466fa5de55e0ff0294367cf62a539941e72cfff8fb7e87"}, +] + [[package]] name = "docutils" version = "0.21.2" @@ -2510,7 +2534,6 @@ description = "Nvidia JIT LTO Library" optional = false python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] @@ -4882,4 +4905,4 @@ ocr = ["easyocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184" +content-hash = "9dfea6fabd2b8be0183a671c1540446cadc1da45a5460e636c71ae5b24abee0d" diff --git a/pyproject.toml b/pyproject.toml index 03f59e05..46c25e90 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = { version = "^1.7", optional = true } +docling-parse = "^0.0.1" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"}