Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Establish DoclingDocument format (experimental) #91

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union

from docling_core.types.experimental import BoundingBox, Size
from PIL import Image

if TYPE_CHECKING:
from docling.datamodel.base_models import BoundingBox, Cell, PageSize
from docling.datamodel.base_models import Cell


class PdfPageBackend(ABC):
Expand All @@ -30,7 +31,7 @@ def get_page_image(
pass

@abstractmethod
def get_size(self) -> "PageSize":
def get_size(self) -> "Size":
pass

@abstractmethod
Expand Down
7 changes: 4 additions & 3 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from typing import Iterable, List, Optional, Union

import pypdfium2 as pdfium
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage

from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.datamodel.base_models import Cell

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -177,8 +178,8 @@ def get_page_image(

return image

def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

def unload(self):
self._ppage = None
Expand Down
7 changes: 4 additions & 3 deletions docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@

import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2._helpers.misc import PdfiumError

from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
from docling.datamodel.base_models import Cell

_log = logging.getLogger(__name__)

Expand Down Expand Up @@ -222,8 +223,8 @@ def get_page_image(

return image

def get_size(self) -> PageSize:
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())

def unload(self):
self._ppage = None
Expand Down
165 changes: 10 additions & 155 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union

from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import BasePictureData, TableCell
from docling_core.types.experimental.labels import DocItemLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
Expand All @@ -24,11 +27,6 @@ class DocInputType(str, Enum):
STREAM = auto()


class CoordOrigin(str, Enum):
    """Identify which page corner box coordinates are measured from.

    Used as the ``coord_origin`` field of ``BoundingBox``, whose conversion
    helpers flip the vertical coordinates when switching between the two.
    """

    # Member values come from auto(); they are not pinned explicitly here.
    TOPLEFT = auto()
    BOTTOMLEFT = auto()

class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
MODEL = auto()
Expand All @@ -41,115 +39,6 @@ class ErrorItem(BaseModel):
error_message: str


class PageSize(BaseModel):
    """Width and height of a page.

    Both dimensions default to 0.0, so a default-constructed instance
    cannot be used as a divisor (see ``BoundingBox.normalized``).
    """

    width: float = 0.0
    height: float = 0.0


class BoundingBox(BaseModel):
    """Axis-aligned rectangle given by its left, top, right and bottom edges.

    ``coord_origin`` records the corner the coordinates are measured from.
    Boxes built through ``from_tuple`` satisfy ``l <= r`` and, depending on
    the origin, ``t <= b`` (``TOPLEFT``) or ``t >= b`` (``BOTTOMLEFT``).
    """

    l: float  # left
    t: float  # top
    r: float  # right
    b: float  # bottom

    coord_origin: CoordOrigin = CoordOrigin.TOPLEFT

    @property
    def width(self) -> float:
        """Horizontal extent, ``r - l``."""
        return self.r - self.l

    @property
    def height(self) -> float:
        """Vertical extent; absolute so it is origin-independent."""
        return abs(self.t - self.b)

    def scaled(self, scale: float) -> "BoundingBox":
        """Return a copy with all four edges multiplied by ``scale``."""
        out_bbox = copy.deepcopy(self)
        out_bbox.l *= scale
        out_bbox.r *= scale
        out_bbox.t *= scale
        out_bbox.b *= scale

        return out_bbox

    def normalized(self, page_size: PageSize) -> "BoundingBox":
        """Return a copy with edges divided by the page dimensions.

        Horizontal edges are divided by ``page_size.width`` and vertical
        edges by ``page_size.height``.

        Raises:
            ZeroDivisionError: if either page dimension is 0 (the
                ``PageSize`` default).
        """
        out_bbox = copy.deepcopy(self)
        out_bbox.l /= page_size.width
        out_bbox.r /= page_size.width
        out_bbox.t /= page_size.height
        out_bbox.b /= page_size.height

        return out_bbox

    def as_tuple(self):
        """Return the edges as a 4-tuple ordered for the box's origin.

        ``(l, t, r, b)`` for ``TOPLEFT`` and ``(l, b, r, t)`` for
        ``BOTTOMLEFT``, i.e. the numerically smaller vertical edge comes
        second for boxes built via ``from_tuple``.
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return (self.l, self.t, self.r, self.b)
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return (self.l, self.b, self.r, self.t)

    @classmethod
    def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
        """Build a box from the first four items of ``coord``.

        The tuple layout mirrors ``as_tuple``. Swapped edges are reordered
        so that the result is well-formed for the given ``origin``.
        """
        if origin == CoordOrigin.TOPLEFT:
            l, t, r, b = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b < t:
                b, t = t, b

            # cls(...) rather than BoundingBox(...) so subclasses get
            # instances of their own type.
            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)
        elif origin == CoordOrigin.BOTTOMLEFT:
            l, b, r, t = coord[0], coord[1], coord[2], coord[3]
            if r < l:
                l, r = r, l
            if b > t:
                b, t = t, b

            return cls(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
        """Return the box area.

        With a bottom-left origin ``t >= b``, so the raw product
        ``(r - l) * (b - t)`` would come out negative; the sign is flipped
        in that case so both origins report the same area for the same box.
        """
        area = (self.r - self.l) * (self.b - self.t)
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            area = -area
        return area

    def intersection_area_with(self, other: "BoundingBox") -> float:
        """Return the overlap area of this box and ``other`` (0.0 if none).

        The vertical orientation is taken from ``self.coord_origin``.
        NOTE(review): both boxes are assumed to share the same origin;
        mixing origins cannot be resolved here without a page height.
        """
        # Horizontal extent is the same for both origins.
        left = max(self.l, other.l)
        right = min(self.r, other.r)
        width = right - left

        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # y grows upwards: the shared top is the lower of the two tops.
            top = min(self.t, other.t)
            bottom = max(self.b, other.b)
            height = top - bottom
        else:
            # y grows downwards: the shared top is the larger t value.
            top = max(self.t, other.t)
            bottom = min(self.b, other.b)
            height = bottom - top

        # If the bounding boxes do not overlap, width or height will be
        # zero or negative.
        if width <= 0 or height <= 0:
            return 0.0

        return width * height

    def to_bottom_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed with a bottom-left origin.

        A box that already uses ``BOTTOMLEFT`` is returned as-is (same
        object, not a copy).
        """
        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
            return self
        elif self.coord_origin == CoordOrigin.TOPLEFT:
            # Flip the vertical axis around the page height.
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.BOTTOMLEFT,
            )

    def to_top_left_origin(self, page_height: float) -> "BoundingBox":
        """Return this box expressed with a top-left origin.

        A box that already uses ``TOPLEFT`` is returned as-is (same
        object, not a copy).
        """
        if self.coord_origin == CoordOrigin.TOPLEFT:
            return self
        elif self.coord_origin == CoordOrigin.BOTTOMLEFT:
            # Flip the vertical axis around the page height.
            return BoundingBox(
                l=self.l,
                r=self.r,
                t=page_height - self.t,
                b=page_height - self.b,
                coord_origin=CoordOrigin.TOPLEFT,
            )


class Cell(BaseModel):
id: int
text: str
Expand All @@ -162,14 +51,14 @@ class OcrCell(Cell):

class Cluster(BaseModel):
id: int
label: str
label: DocItemLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []


class BasePageElement(BaseModel):
label: str
label: DocItemLabel
id: int
page_no: int
cluster: Cluster
Expand All @@ -180,56 +69,22 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []


class TableCell(BaseModel):
    """One table cell: its bounding box, grid span/offsets, text and flags."""

    bbox: BoundingBox
    row_span: int
    col_span: int
    start_row_offset_idx: int
    end_row_offset_idx: int
    start_col_offset_idx: int
    end_col_offset_idx: int
    text: str
    column_header: bool = False
    row_header: bool = False
    row_section: bool = False

    @model_validator(mode="before")
    @classmethod
    def from_dict_format(cls, data: Any) -> Any:
        """Derive ``text`` for dict input that carries it inside the bbox.

        When ``data`` is a dict without a ``"text"`` key, the text is taken
        from ``data["bbox"]["token"]``; if that is empty, the ``"token"``
        values of ``data["text_cell_bboxes"]`` (removed from ``data``) are
        joined with single spaces. Input that already provides ``"text"``,
        or that is not a dict, is passed through unchanged.
        """
        if not isinstance(data, dict):
            return data

        # An explicit text (e.g. keyword construction with a ready-made
        # BoundingBox) must not be overwritten by the token lookup.
        if "text" in data:
            return data

        # Only a dict-shaped bbox can carry a token; a missing or already
        # validated bbox is left for regular field validation to handle.
        bbox = data.get("bbox")
        text = bbox.get("token", "") if isinstance(bbox, dict) else ""

        if not text:
            text_cells = data.pop("text_cell_bboxes", None)
            if text_cells:
                text = " ".join(el["token"] for el in text_cells)

            text = text.strip()
        data["text"] = text

        return data


class TableElement(BasePageElement):
class Table(BasePageElement):
otsl_seq: List[str]
num_rows: int = 0
num_cols: int = 0
table_cells: List[TableCell]


class TableStructurePrediction(BaseModel):
table_map: Dict[int, TableElement] = {}
table_map: Dict[int, Table] = {}


# Page element that declares no fields beyond those of BasePageElement.
class TextElement(BasePageElement): ...


class FigureData(BaseModel):
    """Empty model used as the type of ``FigureElement.data``; no fields."""

    pass


class FigureElement(BasePageElement):
data: Optional[FigureData] = None
data: Optional[BasePictureData] = None
provenance: Optional[str] = None
predicted_class: Optional[str] = None
confidence: Optional[float] = None
Expand All @@ -252,7 +107,7 @@ class PagePredictions(BaseModel):
equations_prediction: Optional[EquationPrediction] = None


PageElement = Union[TextElement, TableElement, FigureElement]
PageElement = Union[TextElement, Table, FigureElement]


class AssembledUnit(BaseModel):
Expand All @@ -266,7 +121,7 @@ class Page(BaseModel):

page_no: int
page_hash: Optional[str] = None
size: Optional[PageSize] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
Expand Down
Loading
Loading