diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index 66df2869..22fdc1b2 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -3,10 +3,11 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Iterable, Optional, Union +from docling_core.types.experimental.base import BoundingBox, Size from PIL import Image if TYPE_CHECKING: - from docling.datamodel.base_models import BoundingBox, Cell, PageSize + from docling.datamodel.base_models import Cell class PdfPageBackend(ABC): @@ -30,7 +31,7 @@ def get_page_image( pass @abstractmethod - def get_size(self) -> "PageSize": + def get_size(self) -> "Size": pass @abstractmethod diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index d7a116d4..94f2dfc5 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -5,12 +5,13 @@ from typing import Iterable, List, Optional, Union import pypdfium2 as pdfium +from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size from docling_parse.docling_parse import pdf_parser from PIL import Image, ImageDraw from pypdfium2 import PdfPage from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize +from docling.datamodel.base_models import Cell _log = logging.getLogger(__name__) @@ -177,8 +178,8 @@ def get_page_image( return image - def get_size(self) -> PageSize: - return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height()) + def get_size(self) -> Size: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) def unload(self): self._ppage = None diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 81ab8488..2c128357 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -6,12 +6,13 @@ import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c +from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size from PIL import Image, ImageDraw from pypdfium2 import PdfPage, PdfTextPage from pypdfium2._helpers.misc import PdfiumError from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend -from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize +from docling.datamodel.base_models import Cell _log = logging.getLogger(__name__) @@ -222,8 +223,8 @@ def get_page_image( return image - def get_size(self) -> PageSize: - return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height()) + def get_size(self) -> Size: + return Size(width=self._ppage.get_width(), height=self._ppage.get_height()) def unload(self): self._ppage = None diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index e9c51d69..5a04cb12 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -4,6 +4,7 @@ from io import BytesIO from typing import Annotated, Any, Dict, List, Optional, Tuple, Union +from docling_core.types.experimental.base import BoundingBox, Size from PIL.Image import Image from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Self @@ -24,11 +25,6 @@ class DocInputType(str, Enum): STREAM = auto() -class CoordOrigin(str, Enum): - TOPLEFT = auto() - BOTTOMLEFT = auto() - - class DoclingComponentType(str, Enum): PDF_BACKEND = auto() MODEL = auto() @@ -41,115 +37,6 @@ class ErrorItem(BaseModel): error_message: str -class PageSize(BaseModel): - width: float = 0.0 - height: float = 0.0 - - -class BoundingBox(BaseModel): - l: float # left - t: float # top - r: float # right - b: float # bottom - - coord_origin: CoordOrigin = CoordOrigin.TOPLEFT - - @property - def width(self): - return self.r - self.l - - @property - def height(self): - return abs(self.t - self.b) - - def scaled(self, scale: float) -> "BoundingBox": - out_bbox = copy.deepcopy(self) - out_bbox.l *= scale - out_bbox.r *= scale - out_bbox.t *= scale - out_bbox.b *= scale - - return out_bbox - - def normalized(self, page_size: PageSize) -> "BoundingBox": - out_bbox = copy.deepcopy(self) - out_bbox.l /= page_size.width - out_bbox.r /= page_size.width - out_bbox.t /= page_size.height - out_bbox.b /= page_size.height - - return out_bbox - - def as_tuple(self): - if self.coord_origin == CoordOrigin.TOPLEFT: - return (self.l, self.t, self.r, self.b) - elif self.coord_origin == CoordOrigin.BOTTOMLEFT: - return (self.l, self.b, self.r, self.t) - - @classmethod - def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin): - if origin == CoordOrigin.TOPLEFT: - l, t, r, b = coord[0], coord[1], coord[2], coord[3] - if r < l: - l, r = r, l - if b < t: - b, t = t, b - - return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin) - elif origin == CoordOrigin.BOTTOMLEFT: - l, b, r, t = coord[0], coord[1], coord[2], coord[3] - if r < l: - l, r = r, l - if b > t: - b, t = t, b - - return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin) - - def area(self) -> float: - return (self.r - self.l) * (self.b - self.t) - - def intersection_area_with(self, other: "BoundingBox") -> float: - # Calculate intersection coordinates - left = max(self.l, other.l) - top = max(self.t, other.t) - right = min(self.r, other.r) - bottom = min(self.b, other.b) - - # Calculate intersection dimensions - width = right - left - height = bottom - top - - # If the bounding boxes do not overlap, width or height will be negative - if width <= 0 or height <= 0: - return 0.0 - - return width * height - - def to_bottom_left_origin(self, page_height) -> "BoundingBox": - if self.coord_origin == CoordOrigin.BOTTOMLEFT: - return self - elif self.coord_origin == CoordOrigin.TOPLEFT: - return BoundingBox( - l=self.l, - r=self.r, - t=page_height - self.t, - b=page_height - self.b, - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - def to_top_left_origin(self, page_height): - if self.coord_origin == CoordOrigin.TOPLEFT: - return self - elif self.coord_origin == CoordOrigin.BOTTOMLEFT: - return BoundingBox( - l=self.l, - r=self.r, - t=page_height - self.t, # self.b - b=page_height - self.b, # self.t - coord_origin=CoordOrigin.TOPLEFT, - ) - - class Cell(BaseModel): id: int text: str @@ -266,7 +153,7 @@ class Page(BaseModel): page_no: int page_hash: Optional[str] = None - size: Optional[PageSize] = None + size: Optional[Size] = None cells: List[Cell] = [] predictions: PagePredictions = PagePredictions() assembled: Optional[AssembledUnit] = None diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b8177730..19fc40cd 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -4,13 +4,13 @@ from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union from docling_core.types import BaseCell, BaseText -from docling_core.types import BoundingBox as DsBoundingBox from docling_core.types import Document as DsDocument from docling_core.types import DocumentDescription as DsDocumentDescription from docling_core.types import FileInfoObject as DsFileInfoObject from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import Table as DsSchemaTable from docling_core.types import TableCell +from docling_core.types.doc.base import BoundingBox as DsBoundingBox from docling_core.types.doc.base import Figure from pydantic import BaseModel from typing_extensions import deprecated diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 3b3c261e..8818517d 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -5,11 +5,12 @@ import numpy import numpy as np +from docling_core.types.experimental.base import BoundingBox, CoordOrigin from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import find_objects, label -from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.base_models import OcrCell, Page _log = logging.getLogger(__name__) diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 36f4f142..208fb9d1 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -7,9 +7,10 @@ from docling_core.types import BaseText from docling_core.types import Document as DsDocument from docling_core.types import Ref +from docling_core.types.experimental.base import BoundingBox, CoordOrigin from PIL import ImageDraw -from docling.datamodel.base_models import BoundingBox, Cluster, CoordOrigin +from docling.datamodel.base_models import Cluster from docling.datamodel.document import ConversionResult diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 5fb4066b..b735f7ba 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -2,8 +2,9 @@ from typing import Iterable import numpy +from docling_core.types.experimental.base import BoundingBox, CoordOrigin -from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.base_models import OcrCell, Page from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index af7b8e7b..1e3e249c 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -4,6 +4,7 @@ import time from typing import Iterable, List +from docling_core.types.experimental.base import CoordOrigin from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import ImageDraw @@ -11,7 +12,6 @@ BoundingBox, Cell, Cluster, - CoordOrigin, LayoutPrediction, Page, ) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 388a0f9e..78b727fc 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -2,11 +2,11 @@ from typing import Iterable, List import numpy +from docling_core.types.experimental.base import BoundingBox from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from PIL import ImageDraw from docling.datamodel.base_models import ( - BoundingBox, Page, TableCell, TableElement, diff --git a/docling/utils/export.py b/docling/utils/export.py index e9e56930..3daa856b 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -1,9 +1,17 @@ import logging from typing import Any, Dict, Iterable, List, Tuple, Union -from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell - -from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell +from docling_core.types.doc.base import ( + BaseCell, + BaseText, + BoundingBox, + Ref, + Table, + TableCell, +) +from docling_core.types.experimental.base import CoordOrigin + +from docling.datamodel.base_models import OcrCell from docling.datamodel.document import ConversionResult, Page _log = logging.getLogger(__name__) diff --git a/poetry.lock b/poetry.lock index 075a5220..1ae91420 100644 --- a/poetry.lock +++ b/poetry.lock @@ -857,50 +857,33 @@ name = "deepsearch-glm" version = "0.21.1" description = "Graph Language Models" optional = false -python-versions = "<4.0,>=3.8" -files = [ - {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:b765d371ab0a4f57dd2532c651d7dc1b4a187395153e619a77b6f0d0f6aefb32"}, - {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:c69e055b98d0a22267a1d0b6139801aecc5b7386289b89f53f976ab723352728"}, - {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3eaa245e5ac4ab3e9d0c95a93e23f58d61d70f11431b76b6705fae358eb31c62"}, - {file = "deepsearch_glm-0.21.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:63d195f6c5b30f4f908436589cffd4a5b9e18553c44c57fb635068a2afbd7fab"}, - {file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:91c9296a2e417a30bf030de0c7c2e2cce4773c58bead039d5e6fccbf7deb2269"}, - {file = "deepsearch_glm-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:166b9958d3a8a98d0671a1e3fdf8083ded9ccf12c2ab80fb9709908a2cf81784"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:13bea2b4e8c04647ec743c3feb1ee66c784db542ab9dbed8dad7eb66fca74b70"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_13_0_x86_64.whl", hash = "sha256:c5b8b8e2207615ff99e535f00548c7b0b8e4ca4593e59edd83fcad98fc318284"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:ba74868243caf5ac850fff7c45c8a372c1cac0193431e22eb41888d45ac79719"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:7815b06aa1c3953488496f191ce0265d0ee7bed5a6b96454a5f9d6f1add28f69"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1a7dd2a1e63cee47f6090ebfebc15f68d24f61d5f4f45a21f22120b2267798d"}, - {file = "deepsearch_glm-0.21.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d52bd2934a27fdc9db5f2d0713dbeec0c94e5c5843d29996e85d641a11498ad0"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_arm64.whl", hash = "sha256:fd4d0d4ff853e566b05769c704a4ea3c050c0cfc5721e4e2035e550fb2a8fe91"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_13_0_x86_64.whl", hash = "sha256:802a59a8a3bea1801bce848d58d19fcdbbcea27d9e2c23f163419d13cdec2345"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:1ead7958bc044000a8d43cce53c9b82be0d341b0ca5cf7b39a0c09f9c4fd8ceb"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:312cf2b0b6560c8dfe5331a5a80a0ed5cb409d29ee6cc999a81696774d50f5e7"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bc3d6f6ca2cffbe5e112818c8aba9a783af8ab7cffff04624bfb5bf8d185b707"}, - {file = "deepsearch_glm-0.21.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8bc537d5e9d108233b7e7249c6739292dc9c36a0f39c11e7f430700df35ff884"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:4db0a700c08ff2d6285461dc5f4a68ccd36876a59b62131f847dc4be76a85989"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:f1041c44d1a4d1a43a324781795b03edfdfd8076c49a610c4dd384c86f2a6236"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_arm64.whl", hash = "sha256:efb0e9678fe07640bd9b6dc07651eaf1f8e5d5602e379b4cf78dbcddc62b50e9"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:f8d46922d74339ec7fd7a6933220ebc36b2ff39738ad9bb74ea55a198dd31b2f"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2872de101ce6d262f57afd3f4d68452064c214c5ab001b7ac698a948e0725314"}, - {file = "deepsearch_glm-0.21.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:187da7dabc11317badbf6983ee508c367299eb39ed78938623206be6b21e41bd"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] +docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"} docutils = "!=0.21" -matplotlib = ">=3.7.1,<4.0.0" -networkx = ">=3.1,<4.0" -netwulf = ">=0.1.5,<0.2.0" -numerize = ">=0.12,<0.13" -numpy = {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\""} +matplotlib = "^3.7.1" +networkx = "^3.1" +netwulf = "^0.1.5" +numerize = "^0.12" +numpy = {version = "^1.26.4", markers = "python_version >= \"3.9\""} pandas = ">=1.5.1" -python-dotenv = ">=1.0.0,<2.0.0" -rich = ">=13.7.0,<14.0.0" +python-dotenv = "^1.0.0" +rich = "^13.7.0" tabulate = ">=0.8.9" -tqdm = ">=4.64.0,<5.0.0" +tqdm = "^4.64.0" [package.extras] toolkit = ["deepsearch-toolkit (>=0.31.0)"] +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/deepsearch-glm.git" +reference = "cau/new-format-dev" +resolved_reference = "6d86b7ddaa8911ec57df9bbabf981a42166e53d2" + [[package]] name = "deprecated" version = "1.2.14" @@ -957,23 +940,27 @@ files = [ [[package]] name = "docling-core" -version = "1.4.0" +version = "1.4.1" description = "A python library to define and validate data types in Docling." optional = false -python-versions = "<4.0,>=3.9" -files = [ - {file = "docling_core-1.4.0-py3-none-any.whl", hash = "sha256:11cd6228d5f321fd11427cf61f40148afd544170e82236228794300f14f8a15a"}, - {file = "docling_core-1.4.0.tar.gz", hash = "sha256:6ea151974172a87a9bca0d63787dc16bdb4170ecb73f18e61e3c2e95eb3fe3d8"}, -] +python-versions = "^3.9" +files = [] +develop = false [package.dependencies] -json-schema-for-humans = ">=1.0.0,<2.0.0" -jsonref = ">=1.1.0,<2.0.0" -jsonschema = ">=4.16.0,<5.0.0" -pandas = ">=2.2.2,<3.0.0" -pydantic = ">=2.6.0,<3.0.0" -pyproject-toml = ">=0.0.10,<0.0.11" -tabulate = ">=0.9.0,<0.10.0" +json-schema-for-humans = "^1.0.0" +jsonref = "^1.1.0" +jsonschema = "^4.16.0" +pandas = "^2.2.2" +pydantic = "^2.6.0" +pyproject-toml = "^0.0.10" +tabulate = "^0.9.0" + +[package.source] +type = "git" +url = "ssh://git@github.com/DS4SD/docling-core.git" +reference = "cau/new-format-dev" +resolved_reference = "ed087646ec9ad86c5b54eb37d7b99322d03487f0" [[package]] name = "docling-ibm-models" @@ -4697,6 +4684,21 @@ files = [ [package.dependencies] six = ">=1.5" +[[package]] +name = "python-docx" +version = "1.1.2" +description = "Create, read, and update Microsoft Word .docx files." +optional = false +python-versions = ">=3.7" +files = [ + {file = "python_docx-1.1.2-py3-none-any.whl", hash = "sha256:08c20d6058916fb19853fcf080f7f42b6270d89eac9fa5f8c15f691c0017fabe"}, + {file = "python_docx-1.1.2.tar.gz", hash = "sha256:0cf1f22e95b9002addca7948e16f2cd7acdfd498047f1941ca5d293db7762efd"}, +] + +[package.dependencies] +lxml = ">=3.1.0" +typing-extensions = ">=4.9.0" + [[package]] name = "python-dotenv" version = "1.0.1" @@ -7257,4 +7259,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "7dc789b3c981898fdabec03f85ebb92273f2bb55b2bf1e18dad1d4c361c6b97b" +content-hash = "1b908180d822d74ae8033e8b6c650b8d00b4365fc7dd36cea6505305651b79b6" diff --git a/pyproject.toml b/pyproject.toml index cd20fb64..f8dda78f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,9 +23,10 @@ packages = [{include = "docling"}] [tool.poetry.dependencies] python = "^3.10" pydantic = "^2.0.0" -docling-core = "^1.4.0" +docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"} docling-ibm-models = "^1.2.0" -deepsearch-glm = "^0.21.1" +deepsearch-glm = {git = "ssh://git@github.com/DS4SD/deepsearch-glm.git", branch = "cau/new-format-dev"} + filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" @@ -61,6 +62,7 @@ torchvision = [ {version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"} ] typer = "^0.12.5" +python-docx = "^1.1.2" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index f9442b05..01e7cbbb 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -1,12 +1,12 @@ from pathlib import Path import pytest +from docling_core.types.experimental.base import BoundingBox from docling.backend.docling_parse_backend import ( DoclingParseDocumentBackend, DoclingParsePageBackend, ) -from docling.datamodel.base_models import BoundingBox @pytest.fixture diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py index 1fa35a0f..c3050b34 100644 --- a/tests/test_backend_pdfium.py +++ b/tests/test_backend_pdfium.py @@ -1,12 +1,12 @@ from pathlib import Path import pytest +from docling_core.types.experimental.base import BoundingBox from docling.backend.pypdfium2_backend import ( PyPdfiumDocumentBackend, PyPdfiumPageBackend, ) -from docling.datamodel.base_models import BoundingBox @pytest.fixture