Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Establish DoclingDocument format (experimental) #91

Closed
wants to merge 18 commits into from
Closed
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Switch everything to use label enum, and more
Signed-off-by: Christoph Auer <[email protected]>
cau-git committed Sep 24, 2024
commit 33373ac0dd4b67e60dedd8bf720d2be647444d72
5 changes: 3 additions & 2 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@

from docling_core.types.experimental.base import BoundingBox, Size
from docling_core.types.experimental.document import BaseFigureData, TableCell
from docling_core.types.experimental.labels import PageLabel
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
@@ -50,14 +51,14 @@ class OcrCell(Cell):

class Cluster(BaseModel):
id: int
label: str
label: PageLabel
bbox: BoundingBox
confidence: float = 1.0
cells: List[Cell] = []


class BasePageElement(BaseModel):
label: str
label: PageLabel
id: int
page_no: int
cluster: Cluster
31 changes: 16 additions & 15 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@
from docling_core.types.doc.base import BoundingBox as DsBoundingBox
from docling_core.types.doc.base import Figure, TableCell
from docling_core.types.experimental.document import DoclingDocument, FileInfo
from docling_core.types.experimental.labels import PageLabel
from pydantic import BaseModel
from typing_extensions import deprecated

@@ -34,21 +35,21 @@
_log = logging.getLogger(__name__)

layout_label_to_ds_type = {
"Title": "title",
"Document Index": "table-of-path_or_stream",
"Section-header": "subtitle-level-1",
"Checkbox-Selected": "checkbox-selected",
"Checkbox-Unselected": "checkbox-unselected",
"Caption": "caption",
"Page-header": "page-header",
"Page-footer": "page-footer",
"Footnote": "footnote",
"Table": "table",
"Formula": "equation",
"List-item": "paragraph",
"Code": "paragraph",
"Picture": "figure",
"Text": "paragraph",
PageLabel.TITLE: "title",
PageLabel.DOCUMENT_INDEX: "table-of-contents",
PageLabel.SECTION_HEADER: "subtitle-level-1",
PageLabel.CHECKBOX_SELECTED: "checkbox-selected",
PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
PageLabel.CAPTION: "caption",
PageLabel.PAGE_HEADER: "page-header",
PageLabel.PAGE_FOOTER: "page-footer",
PageLabel.FOOTNOTE: "footnote",
PageLabel.TABLE: "table",
PageLabel.FORMULA: "equation",
PageLabel.LIST_ITEM: "paragraph",
PageLabel.CODE: "paragraph",
PageLabel.PICTURE: "figure",
PageLabel.TEXT: "paragraph",
}

_EMPTY_DOC = DsDocument(
72 changes: 38 additions & 34 deletions docling/models/layout_model.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
from typing import Iterable, List

from docling_core.types.experimental.base import CoordOrigin
from docling_core.types.experimental.labels import PageLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
from PIL import ImageDraw

@@ -23,23 +24,23 @@
class LayoutModel:

TEXT_ELEM_LABELS = [
"Text",
"Footnote",
"Caption",
"Checkbox-Unselected",
"Checkbox-Selected",
"Section-header",
"Page-header",
"Page-footer",
"Code",
"List-item",
PageLabel.TEXT,
PageLabel.FOOTNOTE,
PageLabel.CAPTION,
PageLabel.CHECKBOX_UNSELECTED,
PageLabel.CHECKBOX_SELECTED,
PageLabel.SECTION_HEADER,
PageLabel.PAGE_HEADER,
PageLabel.PAGE_FOOTER,
PageLabel.CODE,
PageLabel.LIST_ITEM,
# "Formula",
]
PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER]

TABLE_LABEL = "Table"
FIGURE_LABEL = "Picture"
FORMULA_LABEL = "Formula"
TABLE_LABEL = PageLabel.TABLE
FIGURE_LABEL = PageLabel.PICTURE
FORMULA_LABEL = PageLabel.FORMULA

def __init__(self, config):
self.config = config
@@ -50,27 +51,27 @@ def __init__(self, config):
def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
MIN_INTERSECTION = 0.2
CLASS_THRESHOLDS = {
"Caption": 0.35,
"Footnote": 0.35,
"Formula": 0.35,
"List-item": 0.35,
"Page-footer": 0.35,
"Page-header": 0.35,
"Picture": 0.2, # low threshold, adjusted to capture e.g. chemical structures.
"Section-header": 0.45,
"Table": 0.35,
"Text": 0.45,
"Title": 0.45,
"Document Index": 0.45,
"Code": 0.45,
"Checkbox-Selected": 0.45,
"Checkbox-Unselected": 0.45,
"Form": 0.45,
"Key-Value Region": 0.45,
PageLabel.CAPTION: 0.35,
PageLabel.FOOTNOTE: 0.35,
PageLabel.FORMULA: 0.35,
PageLabel.LIST_ITEM: 0.35,
PageLabel.PAGE_FOOTER: 0.35,
PageLabel.PAGE_HEADER: 0.35,
PageLabel.PICTURE: 0.2, # low threshold, adjusted to capture e.g. chemical structures.
PageLabel.SECTION_HEADER: 0.45,
PageLabel.TABLE: 0.35,
PageLabel.TEXT: 0.45,
PageLabel.TITLE: 0.45,
PageLabel.DOCUMENT_INDEX: 0.45,
PageLabel.CODE: 0.45,
PageLabel.CHECKBOX_SELECTED: 0.45,
PageLabel.CHECKBOX_UNSELECTED: 0.45,
PageLabel.FORM: 0.45,
PageLabel.KEY_VALUE_REGION: 0.45,
}

CLASS_REMAPPINGS = {
"Document Index": "Table",
PageLabel.DOCUMENT_INDEX: PageLabel.TABLE,
}

_log.debug("================= Start postprocess function ====================")
@@ -257,7 +258,7 @@ def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height),
confidence=c["confidence"],
label=c["type"],
label=PageLabel(c["type"]),
cells=cluster_cells,
)
clusters_out_new.append(c_new)
@@ -270,9 +271,12 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0))
):
label = PageLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-models uses docling-core types
cluster = Cluster(
id=ix,
label=pred_item["label"],
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
26 changes: 15 additions & 11 deletions docling/utils/layout_utils.py
Original file line number Diff line number Diff line change
@@ -2,6 +2,7 @@
import logging

import networkx as nx
from docling_core.types.experimental.labels import PageLabel

logger = logging.getLogger("layout_utils")

@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE:
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):


def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in ["Table", "Picture"]):
if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == "Picture":
if cluster["type"] == PageLabel.PICTURE:
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == "Picture":
if cluster["type"] == PageLabel.PICTURE:
figures.append(cluster)

if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = "Text"
class_type = PageLabel.TEXT

for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = "Text"
class_type = PageLabel.TEXT
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,7 @@ def merge_cells(cluster_predictions):
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions

@@ -753,9 +754,9 @@ def clean_up_clusters(
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == "Text"
and cluster_2["type"] == "Picture"
or cluster_2["type"] == "Table"
cluster_1["type"] == PageLabel.TEXT
and cluster_2["type"] == PageLabel.PICTURE
or cluster_2["type"] == PageLabel.TABLE
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +772,10 @@ def clean_up_clusters(
DuplicateDeletedClusterIDs.append(cluster_1["id"])
# remove tables that have one pdf cell
if one_cell_table == True:
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
if (
cluster_1["type"] == PageLabel.TABLE
and len(cluster_1["cell_ids"]) < 2
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])

DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))
8 changes: 8 additions & 0 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
@@ -48,6 +48,14 @@ def export_documents(
)
)

# Export Docling document format to doctags (experimental):
with (output_dir / f"{doc_filename}.experimental.doctags").open("w") as fp:
fp.write(conv_res.experimental.export_to_document_tokens())

# Export Docling document format to markdown (experimental):
with (output_dir / f"{doc_filename}.experimental.md").open("w") as fp:
fp.write(conv_res.experimental.export_to_markdown())

# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())
14 changes: 7 additions & 7 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies]
python = "^3.10"
pydantic = "^2.0.0"
docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", branch = "cau/new-format-dev"}
docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", rev = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f"}
docling-ibm-models = "^1.2.0"
deepsearch-glm = {git = "ssh://[email protected]/DS4SD/deepsearch-glm.git", branch = "cau/new-format-dev"}