diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index e5b49343..f8dec5cb 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.utils.file import resolve_source_to_stream +from docling_core.utils.legacy import docling_document_to_legacy from pydantic import BaseModel from typing_extensions import deprecated @@ -189,259 +190,7 @@ class ConversionResult(BaseModel): @property @deprecated("Use document instead.") def legacy_document(self): - reverse_label_mapping = { - DocItemLabel.CAPTION.value: "Caption", - DocItemLabel.FOOTNOTE.value: "Footnote", - DocItemLabel.FORMULA.value: "Formula", - DocItemLabel.LIST_ITEM.value: "List-item", - DocItemLabel.PAGE_FOOTER.value: "Page-footer", - DocItemLabel.PAGE_HEADER.value: "Page-header", - DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples. - DocItemLabel.SECTION_HEADER.value: "Section-header", - DocItemLabel.TABLE.value: "Table", - DocItemLabel.TEXT.value: "Text", - DocItemLabel.TITLE.value: "Title", - DocItemLabel.DOCUMENT_INDEX.value: "Document Index", - DocItemLabel.CODE.value: "Code", - DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected", - DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected", - DocItemLabel.FORM.value: "Form", - DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region", - DocItemLabel.PARAGRAPH.value: "paragraph", - } - - title = "" - desc = DsDocumentDescription(logs=[]) - - page_hashes = [ - PageReference( - hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)), - page=p.page_no, - model="default", - ) - for p in self.document.pages.values() - ] - - file_info = DsFileInfoObject( - filename=self.input.file.name, - document_hash=self.input.document_hash, - num_pages=self.input.page_count, - page_hashes=page_hashes, - ) - - main_text = [] - tables = [] - figures = [] - equations = [] - footnotes = [] - page_headers = [] - page_footers = [] - - embedded_captions = set() - for ix, (item, level) in enumerate( - self.document.iterate_items(self.document.body) - ): - - if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0: - caption = item.caption_text(self.document) - if caption: - embedded_captions.add(caption) - - for item, level in self.document.iterate_items(): - if isinstance(item, DocItem): - item_type = item.label - - if isinstance(item, (TextItem, ListItem, SectionHeaderItem)): - - if isinstance(item, ListItem) and item.marker: - text = f"{item.marker} {item.text}" - else: - text = item.text - - # Can be empty. - prov = [ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, len(item.text)], - ) - for p in item.prov - ] - main_text.append( - BaseText( - text=text, - obj_type=layout_label_to_ds_type.get(item.label), - name=reverse_label_mapping[item.label], - prov=prov, - ) - ) - - # skip captions of they are embedded in the actual - # floating object - if item_type == DocItemLabel.CAPTION and text in embedded_captions: - continue - - elif isinstance(item, TableItem) and item.data: - index = len(tables) - ref_str = f"#/tables/{index}" - main_text.append( - Ref( - name=reverse_label_mapping[item.label], - obj_type=layout_label_to_ds_type.get(item.label), - ref=ref_str, - ), - ) - - # Initialise empty table data grid (only empty cells) - table_data = [ - [ - TableCell( - text="", - # bbox=[0,0,0,0], - spans=[[i, j]], - obj_type="body", - ) - for j in range(item.data.num_cols) - ] - for i in range(item.data.num_rows) - ] - - # Overwrite cells in table data for which there is actual cell content. - for cell in item.data.table_cells: - for i in range( - min(cell.start_row_offset_idx, item.data.num_rows), - min(cell.end_row_offset_idx, item.data.num_rows), - ): - for j in range( - min(cell.start_col_offset_idx, item.data.num_cols), - min(cell.end_col_offset_idx, item.data.num_cols), - ): - celltype = "body" - if cell.column_header: - celltype = "col_header" - elif cell.row_header: - celltype = "row_header" - elif cell.row_section: - celltype = "row_section" - - def make_spans(cell): - for rspan in range( - min( - cell.start_row_offset_idx, - item.data.num_rows, - ), - min( - cell.end_row_offset_idx, item.data.num_rows - ), - ): - for cspan in range( - min( - cell.start_col_offset_idx, - item.data.num_cols, - ), - min( - cell.end_col_offset_idx, - item.data.num_cols, - ), - ): - yield [rspan, cspan] - - spans = list(make_spans(cell)) - table_data[i][j] = GlmTableCell( - text=cell.text, - bbox=( - cell.bbox.as_tuple() - if cell.bbox is not None - else None - ), # check if this is bottom-left - spans=spans, - obj_type=celltype, - col=j, - row=i, - row_header=cell.row_header, - row_section=cell.row_section, - col_header=cell.column_header, - row_span=[ - cell.start_row_offset_idx, - cell.end_row_offset_idx, - ], - col_span=[ - cell.start_col_offset_idx, - cell.end_col_offset_idx, - ], - ) - - # Compute the caption - caption = item.caption_text(self.document) - - tables.append( - DsSchemaTable( - text=caption, - num_cols=item.data.num_cols, - num_rows=item.data.num_rows, - obj_type=layout_label_to_ds_type.get(item.label), - data=table_data, - prov=[ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, 0], - ) - for p in item.prov - ], - ) - ) - - elif isinstance(item, PictureItem): - index = len(figures) - ref_str = f"#/figures/{index}" - main_text.append( - Ref( - name=reverse_label_mapping[item.label], - obj_type=layout_label_to_ds_type.get(item.label), - ref=ref_str, - ), - ) - - # Compute the caption - caption = item.caption_text(self.document) - - figures.append( - Figure( - prov=[ - Prov( - bbox=p.bbox.as_tuple(), - page=p.page_no, - span=[0, len(caption)], - ) - for p in item.prov - ], - obj_type=layout_label_to_ds_type.get(item.label), - text=caption, - # data=[[]], - ) - ) - - page_dimensions = [ - PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width) - for p in self.document.pages.values() - ] - - ds_doc = DsDocument( - name=title, - description=desc, - file_info=file_info, - main_text=main_text, - equations=equations, - footnotes=footnotes, - page_headers=page_headers, - page_footers=page_footers, - tables=tables, - figures=figures, - page_dimensions=page_dimensions, - ) - - return ds_doc + return docling_document_to_legacy(self.document) class _DummyBackend(AbstractDocumentBackend): diff --git a/poetry.lock b/poetry.lock index 67ff0fee..a09e995a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -888,13 +888,13 @@ files = [ [[package]] name = "docling-core" -version = "2.8.0" +version = "2.9.0" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.8.0-py3-none-any.whl", hash = "sha256:392aad49e25f5fd1d279410118fbd91d9aaab9dd92d043738d20c10c57193d86"}, - {file = "docling_core-2.8.0.tar.gz", hash = "sha256:6ac5cbc6f0abcbdf599c2a4b1a3f7b52fd8baebf3c4ebf94d7b7e2ee061a654e"}, + {file = "docling_core-2.9.0-py3-none-any.whl", hash = "sha256:b44b077db5d2ac8a900f30a15abe329c165b1f2eb7f1c90d1275c423c1c3d668"}, + {file = "docling_core-2.9.0.tar.gz", hash = "sha256:1bf12fe67ee4852330e9bac33fe62b45598ff885481e03a88fa8e1bf48252424"}, ] [package.dependencies] @@ -6061,6 +6061,11 @@ files = [ {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"}, {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"}, {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"}, + {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"}, + {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"}, + {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"}, {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"}, {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"}, @@ -7597,4 +7602,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "621f8de238fd1f82cfd783531b6ab7c1598378a499c0dcfac323d66bc7ab32ea" +content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9" diff --git a/pyproject.toml b/pyproject.toml index b2593d77..649f07b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ packages = [{include = "docling"}] # actual dependencies: ###################### python = "^3.9" -docling-core = { version = "^2.8.0", extras = ["chunking"] } +docling-core = { version = "^2.9.0", extras = ["chunking"] } pydantic = "^2.0.0" docling-ibm-models = "^2.0.6" deepsearch-glm = "^1.0.0"