Skip to content

Commit

Permalink
test: improve typing definitions (part 1) (#72)
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Sep 12, 2024
1 parent 53569a1 commit 8aa476c
Show file tree
Hide file tree
Showing 9 changed files with 91 additions and 29 deletions.
9 changes: 6 additions & 3 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import Any, Iterable, Optional, Union
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union

from PIL import Image

if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize


class PdfPageBackend(ABC):

Expand All @@ -17,12 +20,12 @@ def get_text_cells(self) -> Iterable["Cell"]:
pass

@abstractmethod
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]:
pass

@abstractmethod
def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass

Expand Down
9 changes: 4 additions & 5 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import random
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Iterable, List, Optional, Union

import pypdfium2 as pdfium
from docling_parse.docling_parse import pdf_parser
Expand All @@ -22,7 +22,6 @@ def __init__(
self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)

self._dpage = None
self.valid = "pages" in parsed_page
if self.valid:
self._dpage = parsed_page["pages"][0]
Expand Down Expand Up @@ -68,7 +67,7 @@ def get_text_in_rect(self, bbox: BoundingBox) -> str:
return text_piece

def get_text_cells(self) -> Iterable[Cell]:
cells = []
cells: List[Cell] = []
cell_counter = 0

if not self.valid:
Expand Down Expand Up @@ -130,7 +129,7 @@ def draw_clusters_and_cells():

return cells

def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32

for i in range(len(self._dpage["images"])):
Expand All @@ -145,7 +144,7 @@ def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
yield cropbox

def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:

page_size = self.get_size()
Expand Down
8 changes: 4 additions & 4 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2._helpers.misc import PdfiumError

from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
Expand All @@ -29,12 +29,12 @@ def __init__(
exc_info=True,
)
self.valid = False
self.text_page = None
self.text_page: Optional[PdfTextPage] = None

def is_valid(self) -> bool:
return self.valid

def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
Expand Down Expand Up @@ -189,7 +189,7 @@ def draw_clusters_and_cells():
return cells

def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:

page_size = self.get_size()
Expand Down
6 changes: 3 additions & 3 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def as_tuple(self):
return (self.l, self.b, self.r, self.t)

@classmethod
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT:
l, t, r, b = coord[0], coord[1], coord[2], coord[3]
if r < l:
Expand Down Expand Up @@ -246,7 +246,7 @@ class EquationPrediction(BaseModel):


class PagePredictions(BaseModel):
layout: LayoutPrediction = None
layout: Optional[LayoutPrediction] = None
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
Expand All @@ -267,7 +267,7 @@ class Page(BaseModel):
page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
cells: List[Cell] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None

Expand Down
4 changes: 2 additions & 2 deletions docling/pipeline/base_model_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from pathlib import Path
from typing import Iterable
from typing import Callable, Iterable, List

from docling.datamodel.base_models import Page, PipelineOptions


class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe = []
self.model_pipe: List[Callable] = []
self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options

Expand Down
29 changes: 19 additions & 10 deletions docling/utils/export.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple
from typing import Any, Dict, Iterable, List, Tuple, Union

from docling_core.types.doc.base import BaseCell, Ref, Table, TableCell
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell

from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
from docling.datamodel.document import ConvertedDocument, Page
from docling.datamodel.document import ConversionResult, Page

_log = logging.getLogger(__name__)

Expand All @@ -15,7 +15,10 @@ def _export_table_to_html(table: Table):
# to the docling-core package.

def _get_tablecell_span(cell: TableCell, ix):
span = set([s[ix] for s in cell.spans])
if cell.spans is None:
span = set()
else:
span = set([s[ix] for s in cell.spans])
if len(span) == 0:
return 1, None, None
return len(span), min(span), max(span)
Expand All @@ -24,6 +27,8 @@ def _get_tablecell_span(cell: TableCell, ix):
nrows = table.num_rows
ncols = table.num_cols

if table.data is None:
return ""
for i in range(nrows):
body += "<tr>"
for j in range(ncols):
Expand Down Expand Up @@ -66,7 +71,7 @@ def _get_tablecell_span(cell: TableCell, ix):


def generate_multimodal_pages(
doc_result: ConvertedDocument,
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:

label_to_doclaynet = {
Expand Down Expand Up @@ -94,7 +99,7 @@ def generate_multimodal_pages(
page_no = 0
start_ix = 0
end_ix = 0
doc_items = []
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []

doc = doc_result.output

Expand All @@ -105,11 +110,11 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
item_type = item.obj_type
label = label_to_doclaynet.get(item_type, None)

if label is None:
if label is None or item.prov is None or page.size is None:
continue

bbox = BoundingBox.from_tuple(
item.prov[0].bbox, origin=CoordOrigin.BOTTOMLEFT
tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
)
new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
page_size=page.size
Expand Down Expand Up @@ -137,13 +142,15 @@ def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
return segments

def _process_page_cells(page: Page):
cells = []
cells: List[dict] = []
if page.size is None:
return cells
for cell in page.cells:
new_bbox = cell.bbox.to_top_left_origin(
page_height=page.size.height
).normalized(page_size=page.size)
is_ocr = isinstance(cell, OcrCell)
ocr_confidence = cell.confidence if is_ocr else 1.0
ocr_confidence = cell.confidence if isinstance(cell, OcrCell) else 1.0
cells.append(
{
"text": cell.text,
Expand All @@ -170,6 +177,8 @@ def _process_page():

return content_text, content_md, content_dt, page_cells, page_segments, page

if doc.main_text is None:
return
for ix, orig_item in enumerate(doc.main_text):

item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
Expand Down
33 changes: 32 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ pytest-xdist = "^3.3.1"
types-requests = "^2.31.0.2"
flake8-pyproject = "^1.2.3"
pylint = "^2.17.5"
pandas-stubs = "^2.2.2.240909"
ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
Expand Down Expand Up @@ -114,6 +115,14 @@ pretty = true
no_implicit_optional = true
python_version = "3.10"

[[tool.mypy.overrides]]
module = [
"docling_parse.*",
"pypdfium2.*",
"networkx.*",
]
ignore_missing_imports = true

[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]
Expand Down
13 changes: 12 additions & 1 deletion tests/verify_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):


def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
assert doc_true.main_text is not None, "doc_true cannot be None"
assert doc_pred.main_text is not None, "doc_pred cannot be None"

assert len(doc_true.main_text) == len(
doc_pred.main_text
Expand All @@ -68,6 +70,13 @@ def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):


def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
if doc_true.tables is None:
# No tables to check
assert doc_pred.tables is None, "not expecting any table on this document"
return True

assert doc_pred.tables is not None, "no tables predicted, but expected in doc_true"

assert len(doc_true.tables) == len(
doc_pred.tables
), "document has different count of tables than expected."
Expand All @@ -82,6 +91,8 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
true_item.num_cols == pred_item.num_cols
), "table does not have the same #-cols"

assert true_item.data is not None, "documents are expected to have table data"
assert pred_item.data is not None, "documents are expected to have table data"
for i, row in enumerate(true_item.data):
for j, col in enumerate(true_item.data[i]):

Expand Down Expand Up @@ -135,7 +146,7 @@ def verify_conversion_result(
doc_true_pages = PageList.validate_json(fr.read())

with open(json_path, "r") as fr:
doc_true = DsDocument.model_validate_json(fr.read())
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

with open(md_path, "r") as fr:
doc_true_md = fr.read()
Expand Down

0 comments on commit 8aa476c

Please sign in to comment.