Skip to content

Commit

Permalink
feat: allow computing page images on-demand with scale and cache them (
Browse files Browse the repository at this point in the history
…#36)

* feat: allow computing page images on-demand and cache them

Signed-off-by: Michele Dolfi <[email protected]>

* feat: expose scale for export of page images and document elements

Signed-off-by: Michele Dolfi <[email protected]>

* fix comment

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Aug 20, 2024
1 parent c253dd7 commit 78347bf
Show file tree
Hide file tree
Showing 9 changed files with 104 additions and 77 deletions.
4 changes: 3 additions & 1 deletion docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ def get_text_cells(self) -> Iterable[Cell]:
cell_counter += 1

def draw_clusters_and_cells():
image = self.get_page_image()
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
Expand Down
4 changes: 3 additions & 1 deletion docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,9 @@ def merge_group(group: List[Cell]) -> Cell:
return merged_cells

def draw_clusters_and_cells():
image = self.get_page_image()
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
Expand Down
51 changes: 41 additions & 10 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, model_validator
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self

from docling.backend.abstract_backend import PdfPageBackend

Expand Down Expand Up @@ -234,14 +236,30 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)

page_no: int
page_hash: str = None
size: PageSize = None
image: Image = None
page_hash: Optional[str] = None
size: Optional[PageSize] = None
cells: List[Cell] = None
predictions: PagePredictions = PagePredictions()
assembled: AssembledUnit = None
assembled: Optional[AssembledUnit] = None

_backend: PdfPageBackend = None # Internal PDF backend
_backend: Optional[PdfPageBackend] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[float, Image] = (
{}
) # Cache of images in different scales. By default it is cleared during assembling.

def get_image(self, scale: float = 1.0) -> Optional[Image]:
    """Return the page image rendered at ``scale``, caching the result.

    While a backend is attached, a scale that is not cached yet is rendered
    through ``self._backend.get_page_image(scale=scale)`` and stored in
    ``self._image_cache``, so later requests for the same scale return the
    same cached object. Without a backend, only already-cached scales can
    be served.

    Args:
        scale: Scale forwarded to the backend renderer; also the cache key.

    Returns:
        The image for ``scale``, or ``None`` when no backend is attached
        and that scale is not in the cache.
    """
    if self._backend is None:
        # Nothing can be rendered without a backend; serve from cache only.
        return self._image_cache.get(scale)
    if scale not in self._image_cache:
        self._image_cache[scale] = self._backend.get_page_image(scale=scale)
    return self._image_cache[scale]

@property
def image(self) -> Optional[Image]:
    """Page image at the page's default scale (``_default_image_scale``).

    Delegates to ``get_image``, so the value may be ``None``.
    """
    default_scale = self._default_image_scale
    return self.get_image(scale=default_scale)


class DocumentStream(BaseModel):
Expand All @@ -268,6 +286,19 @@ class PipelineOptions(BaseModel):


class AssembleOptions(BaseModel):
    """Options for the assemble step that control page-image retention."""

    # Deprecated switch, kept so existing callers keep working. It is mapped
    # onto `images_scale` by `set_page_images_from_deprecated`.
    keep_page_images: Annotated[
        bool,
        Field(
            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
        ),
    ] = False  # False: page images are removed in the assemble step
    images_scale: Optional[float] = None  # if set, the scale for generated images

    @model_validator(mode="after")
    def set_page_images_from_deprecated(self) -> Self:
        """Translate the deprecated ``keep_page_images`` flag into ``images_scale``.

        When ``keep_page_images`` is truthy and no explicit ``images_scale``
        was given, ``images_scale`` is set to ``1.0``. An explicit
        ``images_scale`` is never overwritten.

        Returns:
            The validated model instance.
        """
        default_scale = 1.0
        with warnings.catch_warnings():
            # The check below reads the deprecated field; keep that internal
            # read from surfacing a DeprecationWarning to the user.
            warnings.simplefilter("ignore", DeprecationWarning)
            if self.keep_page_images and self.images_scale is None:
                self.images_scale = default_scale
        return self
17 changes: 16 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Type, Union
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

from docling_core.types import BaseCell, BaseText
from docling_core.types import BoundingBox as DsBoundingBox
Expand All @@ -21,6 +21,7 @@
DocumentStream,
FigureElement,
Page,
PageElement,
TableElement,
TextElement,
)
Expand Down Expand Up @@ -302,6 +303,20 @@ def render_as_markdown(self):
else:
return ""

def render_element_images(
    self, element_types: Tuple[Type[PageElement], ...] = (FigureElement,)
):
    """Yield ``(element, image)`` pairs for assembled elements of given types.

    Each image is cropped from the element's page image. The page image is
    taken at the page's ``_default_image_scale``, so the element's bounding
    box is scaled by the same factor and converted to a top-left origin
    before cropping.

    Args:
        element_types: Element classes to render, used as the second
            argument of ``isinstance``.

    Yields:
        Tuples of the matching element and its cropped image.

    NOTE(review): ``page.image`` is ``Optional``; this assumes page images
    are still available for the pages involved -- confirm at the call site.
    """
    for element in self.assembled.elements:
        if not isinstance(element, element_types):
            continue

        page = self.pages[element.page_no]
        scale = page._default_image_scale
        crop_bbox = element.cluster.bbox.scaled(scale=scale).to_top_left_origin(
            page_height=page.size.height * scale
        )

        yield element, page.image.crop(crop_bbox.as_tuple())


class DocumentConversionInput(BaseModel):

Expand Down
18 changes: 12 additions & 6 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,8 @@ def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
# Free up mem resources before moving on with next batch

# Remove page images (can be disabled)
if not self.assemble_options.keep_page_images:
assembled_page.image = (
None # Comment this if you want to visualize page images
)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}

# Unload backend
assembled_page._backend.unload()
Expand Down Expand Up @@ -231,7 +229,15 @@ def initialize_page(self, doc: InputDocument, page: Page) -> Page:

# Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
    """Render the page images needed later into the page's image cache.

    Images are requested through ``page.get_image`` rather than assigned
    to ``page.image``. The scale-1.0 image is always requested. If
    ``assemble_options.images_scale`` is set, it also becomes the page's
    default scale and is requested as well.

    Args:
        doc: The input document the page belongs to (not used here).
        page: The page whose images are generated.

    Returns:
        The same ``page`` object.
    """
    # default scale
    page.get_image(scale=1.0)

    # user requested scales
    images_scale = self.assemble_options.images_scale
    if images_scale is not None:
        page._default_image_scale = images_scale
        # this will trigger storing the image in the internal cache
        page.get_image(scale=images_scale)

    return page

Expand All @@ -247,7 +253,7 @@ def draw_text_boxes(image, cells):
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()

# draw_text_boxes(page.image, cells)
# draw_text_boxes(page.get_image(scale=1.0), cells)

return page

Expand Down
2 changes: 1 addition & 1 deletion docling/models/easyocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

for page in page_batch:
# rects = page._fpage.
high_res_image = page._backend.get_page_image(scale=self.scale)
high_res_image = page.get_image(scale=self.scale)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)

Expand Down
4 changes: 3 additions & 1 deletion docling/models/layout_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,9 @@ def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
clusters = []
for ix, pred_item in enumerate(self.layout_predictor.predict(page.image)):
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
cluster = Cluster(
id=ix,
label=pred_item["label"],
Expand Down
12 changes: 4 additions & 8 deletions docling/models/table_structure_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,9 @@ def __init__(self, config):
self.scale = 2.0 # Scale up table input images to 144 dpi

def draw_table_and_cells(self, page: Page, tbl_list: List[TableElement]):
image = page._backend.get_page_image()
image = (
page._backend.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)

for table_element in tbl_list:
Expand Down Expand Up @@ -94,13 +96,7 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
"width": page.size.width * self.scale,
"height": page.size.height * self.scale,
}
# add image to page input.
if self.scale == 1.0:
page_input["image"] = numpy.asarray(page.image)
else: # render new page image on the fly at desired scale
page_input["image"] = numpy.asarray(
page._backend.get_page_image(scale=self.scale)
)
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))

table_clusters, table_bboxes = zip(*in_tables)

Expand Down
69 changes: 21 additions & 48 deletions examples/export_figures.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,44 +15,7 @@

_log = logging.getLogger(__name__)


def export_page_images(
    doc: ConvertedDocument,
    output_dir: Path,
):
    """Write every page image of ``doc`` as a PNG file into ``output_dir``.

    The directory is created if it does not exist. Files are named
    ``<input file stem>-<page_no + 1>.png``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for page in doc.pages:
        target = output_dir / f"{stem}-{page.page_no + 1}.png"
        with target.open("wb") as out:
            page.image.save(out, format="PNG")


def export_element_images(
    doc: ConvertedDocument,
    output_dir: Path,
    allowed_element_types: Tuple[PageElement] = (FigureElement,),
):
    """Crop matching assembled elements from their page images and save them.

    The directory is created if it does not exist. Every element of
    ``doc.assembled.elements`` that is an instance of
    ``allowed_element_types`` is cropped from its page image and written as
    ``<input file stem>-element-<index>.png``, where ``index`` is the
    element's position among all assembled elements.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stem = doc.input.file.stem

    for index, item in enumerate(doc.assembled.elements):
        if not isinstance(item, allowed_element_types):
            continue

        page = doc.pages[item.page_no]
        # The bounding box is converted to a top-left origin before cropping.
        bbox = item.cluster.bbox.to_top_left_origin(page_height=page.size.height)
        crop = page.image.crop(bbox.as_tuple())

        target = output_dir / f"{stem}-element-{index}.png"
        with target.open("wb") as out:
            crop.save(out, "PNG")
IMAGE_RESOLUTION_SCALE = 2.0


def main():
Expand All @@ -61,37 +24,47 @@ def main():
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")

input_files = DocumentConversionInput.from_paths(input_doc_paths)

# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.keep_page_images = True
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE

doc_converter = DocumentConverter(assemble_options=assemble_options)

start_time = time.time()

converted_docs = doc_converter.convert(input_files)

output_dir.mkdir(parents=True, exist_ok=True)
for doc in converted_docs:
if doc.status != ConversionStatus.SUCCESS:
_log.info(f"Document {doc.input.file} failed to convert.")
continue

# Export page images
export_page_images(doc, output_dir=Path("./scratch"))
doc_filename = doc.input.file.stem

# Export figures
# export_element_images(doc, output_dir=Path("./scratch"), allowed_element_types=(FigureElement,))
# Export page images
for page in doc.pages:
page_no = page.page_no + 1
page_image_filename = output_dir / f"{doc_filename}-{page_no}.png"
with page_image_filename.open("wb") as fp:
page.image.save(fp, format="PNG")

# Export figures and tables
export_element_images(
doc,
output_dir=Path("./scratch"),
allowed_element_types=(FigureElement, TableElement),
)
for element, image in doc.render_element_images(
element_types=(FigureElement, TableElement)
):
element_image_filename = (
output_dir / f"{doc_filename}-element-{element.id}.png"
)
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")

end_time = time.time() - start_time

Expand Down

0 comments on commit 78347bf

Please sign in to comment.