From 78347bf679c393378eab0bd383929fced88afeae Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Tue, 20 Aug 2024 13:27:19 +0200 Subject: [PATCH] feat: allow computing page images on-demand with scale and cache them (#36) * feat: allow computing page images on-demand and cache them Signed-off-by: Michele Dolfi * feat: expose scale for export of page images and document elements Signed-off-by: Michele Dolfi * fix comment Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docling/backend/docling_parse_backend.py | 4 +- docling/backend/pypdfium2_backend.py | 4 +- docling/datamodel/base_models.py | 51 ++++++++++++++---- docling/datamodel/document.py | 17 +++++- docling/document_converter.py | 18 ++++--- docling/models/easyocr_model.py | 2 +- docling/models/layout_model.py | 4 +- docling/models/table_structure_model.py | 12 ++--- examples/export_figures.py | 69 ++++++++---------------- 9 files changed, 104 insertions(+), 77 deletions(-) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 09ec5028..cea38df2 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -84,7 +84,9 @@ def get_text_cells(self) -> Iterable[Cell]: cell_counter += 1 def draw_clusters_and_cells(): - image = self.get_page_image() + image = ( + self.get_page_image() + ) # make new image to avoid drawing on the saved ones draw = ImageDraw.Draw(image) for c in cells: x0, y0, x1, y1 = c.bbox.as_tuple() diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index e5540f4c..33f059df 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -134,7 +134,9 @@ def merge_group(group: List[Cell]) -> Cell: return merged_cells def draw_clusters_and_cells(): - image = self.get_page_image() + image = ( + self.get_page_image() + ) # make new image to avoid drawing on the saved ones 
draw = ImageDraw.Draw(image) for c in cells: x0, y0, x1, y1 = c.bbox.as_tuple() diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 10086917..a4046a65 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -1,10 +1,12 @@ import copy +import warnings from enum import Enum, auto from io import BytesIO -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Annotated, Any, Dict, List, Optional, Tuple, Union from PIL.Image import Image -from pydantic import BaseModel, ConfigDict, model_validator +from pydantic import BaseModel, ConfigDict, Field, model_validator +from typing_extensions import Self from docling.backend.abstract_backend import PdfPageBackend @@ -234,14 +236,30 @@ class Page(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) page_no: int - page_hash: str = None - size: PageSize = None - image: Image = None + page_hash: Optional[str] = None + size: Optional[PageSize] = None cells: List[Cell] = None predictions: PagePredictions = PagePredictions() - assembled: AssembledUnit = None + assembled: Optional[AssembledUnit] = None - _backend: PdfPageBackend = None # Internal PDF backend + _backend: Optional[PdfPageBackend] = ( + None # Internal PDF backend. By default it is cleared during assembling. + ) + _default_image_scale: float = 1.0 # Default image scale for external usage. + _image_cache: Dict[float, Image] = ( + {} + ) # Cache of images in different scales. By default it is cleared during assembling. 
+ + def get_image(self, scale: float = 1.0) -> Optional[Image]: + if self._backend is None: + return self._image_cache.get(scale, None) + if not scale in self._image_cache: + self._image_cache[scale] = self._backend.get_page_image(scale=scale) + return self._image_cache[scale] + + @property + def image(self) -> Optional[Image]: + return self.get_image(scale=self._default_image_scale) class DocumentStream(BaseModel): @@ -268,6 +286,19 @@ class PipelineOptions(BaseModel): class AssembleOptions(BaseModel): - keep_page_images: bool = ( - False # False: page images are removed in the assemble step - ) + keep_page_images: Annotated[ + bool, + Field( + deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead" + ), + ] = False # False: page images are removed in the assemble step + images_scale: Optional[float] = None # if set, the scale for generated images + + @model_validator(mode="after") + def set_page_images_from_deprecated(self) -> Self: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + default_scale = 1.0 + if self.keep_page_images and self.images_scale is None: + self.images_scale = default_scale + return self diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index cc11c331..fe19afbc 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,7 +1,7 @@ import logging from io import BytesIO from pathlib import Path, PurePath -from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union +from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union from docling_core.types import BaseCell, BaseText from docling_core.types import BoundingBox as DsBoundingBox @@ -21,6 +21,7 @@ DocumentStream, FigureElement, Page, + PageElement, TableElement, TextElement, ) @@ -302,6 +303,20 @@ def render_as_markdown(self): else: return "" + def render_element_images( + self, element_types: Tuple[PageElement] = (FigureElement,) + ): + for 
element in self.assembled.elements: + if isinstance(element, element_types): + page_ix = element.page_no + scale = self.pages[page_ix]._default_image_scale + crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin( + page_height=self.pages[page_ix].size.height * scale + ) + + cropped_im = self.pages[page_ix].image.crop(crop_bbox.as_tuple()) + yield element, cropped_im + class DocumentConversionInput(BaseModel): diff --git a/docling/document_converter.py b/docling/document_converter.py index 9954bc9b..bcda748f 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -188,10 +188,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument: # Free up mem resources before moving on with next batch # Remove page images (can be disabled) - if not self.assemble_options.keep_page_images: - assembled_page.image = ( - None # Comment this if you want to visualize page images - ) + if self.assemble_options.images_scale is None: + assembled_page._image_cache = {} # Unload backend assembled_page._backend.unload() @@ -231,7 +229,15 @@ def initialize_page(self, doc: InputDocument, page: Page) -> Page: # Generate the page image and store it in the page object def populate_page_images(self, doc: InputDocument, page: Page) -> Page: - page.image = page._backend.get_page_image() + # default scale + page.get_image(scale=1.0) + + # user requested scales + if self.assemble_options.images_scale is not None: + page._default_image_scale = self.assemble_options.images_scale + page.get_image( + scale=self.assemble_options.images_scale + ) # this will trigger storing the image in the internal cache return page @@ -247,7 +253,7 @@ def draw_text_boxes(image, cells): draw.rectangle([(x0, y0), (x1, y1)], outline="red") image.show() - # draw_text_boxes(page.image, cells) + # draw_text_boxes(page.get_image(scale=1.0), cells) return page diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index d5bca501..d9452ce6 
100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -30,7 +30,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: # rects = page._fpage. - high_res_image = page._backend.get_page_image(scale=self.scale) + high_res_image = page.get_image(scale=self.scale) im = numpy.array(high_res_image) result = self.reader.readtext(im) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 93f80d54..af7b8e7b 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -267,7 +267,9 @@ def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height): def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: clusters = [] - for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)): + for ix, pred_item in enumerate( + self.layout_predictor.predict(page.get_image(scale=1.0)) + ): cluster = Cluster( id=ix, label=pred_item["label"], diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 09c789d2..f7d03cb9 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -34,7 +34,9 @@ def __init__(self, config): self.scale = 2.0 # Scale up table input images to 144 dpi def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]): - image = page._backend.get_page_image() + image = ( + page._backend.get_page_image() + ) # make new image to avoid drawing on the saved ones draw = ImageDraw.Draw(image) for table_element in tbl_list: @@ -94,13 +96,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: "width": page.size.width * self.scale, "height": page.size.height * self.scale, } - # add image to page input. 
- if self.scale == 1.0: - page_input["image"] = numpy.asarray(page.image) - else: # render new page image on the fly at desired scale - page_input["image"] = numpy.asarray( - page._backend.get_page_image(scale=self.scale) - ) + page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) table_clusters, table_bboxes = zip(*in_tables) diff --git a/examples/export_figures.py b/examples/export_figures.py index 6cd98430..d71a60eb 100644 --- a/examples/export_figures.py +++ b/examples/export_figures.py @@ -15,44 +15,7 @@ _log = logging.getLogger(__name__) - -def export_page_images( - doc: ConvertedDocument, - output_dir: Path, -): - output_dir.mkdir(parents=True, exist_ok=True) - - doc_filename = doc.input.file.stem - - for page in doc.pages: - page_no = page.page_no + 1 - page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" - with page_image_filename.open("wb") as fp: - page.image.save(fp, format="PNG") - - -def export_element_images( - doc: ConvertedDocument, - output_dir: Path, - allowed_element_types: Tuple[PageElement] = (FigureElement,), -): - output_dir.mkdir(parents=True, exist_ok=True) - - doc_filename = doc.input.file.stem - - for element_ix, element in enumerate(doc.assembled.elements): - if isinstance(element, allowed_element_types): - page_ix = element.page_no - crop_bbox = element.cluster.bbox.to_top_left_origin( - page_height=doc.pages[page_ix].size.height - ) - - cropped_im = doc.pages[page_ix].image.crop(crop_bbox.as_tuple()) - element_image_filename = ( - output_dir / f"{doc_filename}-element-{element_ix}.png" - ) - with element_image_filename.open("wb") as fp: - cropped_im.save(fp, "PNG") +IMAGE_RESOLUTION_SCALE = 2.0 def main(): @@ -61,13 +24,16 @@ def main(): input_doc_paths = [ Path("./test/data/2206.01062.pdf"), ] + output_dir = Path("./scratch") input_files = DocumentConversionInput.from_paths(input_doc_paths) # Important: For operating with page images, we must keep them, otherwise the DocumentConverter # will destroy them 
for cleaning up memory. + # This is done by setting AssembleOptions.images_scale, which also defines the scale of images. + # scale=1 corresponds to a standard 72 DPI image assemble_options = AssembleOptions() - assemble_options.keep_page_images = True + assemble_options.images_scale = IMAGE_RESOLUTION_SCALE doc_converter = DocumentConverter(assemble_options=assemble_options) @@ -75,23 +41,30 @@ def main(): converted_docs = doc_converter.convert(input_files) + output_dir.mkdir(parents=True, exist_ok=True) for doc in converted_docs: if doc.status != ConversionStatus.SUCCESS: _log.info(f"Document {doc.input.file} failed to convert.") continue - # Export page images - export_page_images(doc, output_dir=Path("./scratch")) + doc_filename = doc.input.file.stem - # Export figures - # export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,)) + # Export page images + for page in doc.pages: + page_no = page.page_no + 1 + page_image_filename = output_dir / f"{doc_filename}-{page_no}.png" + with page_image_filename.open("wb") as fp: + page.image.save(fp, format="PNG") # Export figures and tables - export_element_images( - doc, - output_dir=Path("./scratch"), - allowed_element_types=(FigureElement, TableElement), - ) + for element, image in doc.render_element_images( + element_types=(FigureElement, TableElement) + ): + element_image_filename = ( + output_dir / f"{doc_filename}-element-{element.id}.png" + ) + with element_image_filename.open("wb") as fp: + image.save(fp, "PNG") end_time = time.time() - start_time