Switch layout labels to the PageLabel enum, add experimental exports, and pin docling-core

Signed-off-by: Christoph Auer <[email protected]>
DS4SD · cau-git · Sep 20, 2024 · Sep 20, 2024 · Sep 23, 2024 · Sep 23, 2024
commit 33373ac0dd4b67e60dedd8bf720d2be647444d72
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -6,6 +6,7 @@
 
 from docling_core.types.experimental.base import BoundingBox, Size
 from docling_core.types.experimental.document import BaseFigureData, TableCell
+from docling_core.types.experimental.labels import PageLabel
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self
@@ -50,14 +51,14 @@ class OcrCell(Cell):
 
 class Cluster(BaseModel):
     id: int
-    label: str
+    label: PageLabel
     bbox: BoundingBox
     confidence: float = 1.0
     cells: List[Cell] = []
 
 
 class BasePageElement(BaseModel):
-    label: str
+    label: PageLabel
     id: int
     page_no: int
     cluster: Cluster

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
@@ -12,6 +12,7 @@
 from docling_core.types.doc.base import BoundingBox as DsBoundingBox
 from docling_core.types.doc.base import Figure, TableCell
 from docling_core.types.experimental.document import DoclingDocument, FileInfo
+from docling_core.types.experimental.labels import PageLabel
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -34,21 +35,21 @@
 _log = logging.getLogger(__name__)
 
 layout_label_to_ds_type = {
-    "Title": "title",
-    "Document Index": "table-of-path_or_stream",
-    "Section-header": "subtitle-level-1",
-    "Checkbox-Selected": "checkbox-selected",
-    "Checkbox-Unselected": "checkbox-unselected",
-    "Caption": "caption",
-    "Page-header": "page-header",
-    "Page-footer": "page-footer",
-    "Footnote": "footnote",
-    "Table": "table",
-    "Formula": "equation",
-    "List-item": "paragraph",
-    "Code": "paragraph",
-    "Picture": "figure",
-    "Text": "paragraph",
+    PageLabel.TITLE: "title",
+    PageLabel.DOCUMENT_INDEX: "table-of-contents",
+    PageLabel.SECTION_HEADER: "subtitle-level-1",
+    PageLabel.CHECKBOX_SELECTED: "checkbox-selected",
+    PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected",
+    PageLabel.CAPTION: "caption",
+    PageLabel.PAGE_HEADER: "page-header",
+    PageLabel.PAGE_FOOTER: "page-footer",
+    PageLabel.FOOTNOTE: "footnote",
+    PageLabel.TABLE: "table",
+    PageLabel.FORMULA: "equation",
+    PageLabel.LIST_ITEM: "paragraph",
+    PageLabel.CODE: "paragraph",
+    PageLabel.PICTURE: "figure",
+    PageLabel.TEXT: "paragraph",
 }
 
 _EMPTY_DOC = DsDocument(

diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
@@ -5,6 +5,7 @@
 from typing import Iterable, List
 
 from docling_core.types.experimental.base import CoordOrigin
+from docling_core.types.experimental.labels import PageLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 from PIL import ImageDraw
 
@@ -23,23 +24,23 @@
 class LayoutModel:
 
     TEXT_ELEM_LABELS = [
-        "Text",
-        "Footnote",
-        "Caption",
-        "Checkbox-Unselected",
-        "Checkbox-Selected",
-        "Section-header",
-        "Page-header",
-        "Page-footer",
-        "Code",
-        "List-item",
+        PageLabel.TEXT,
+        PageLabel.FOOTNOTE,
+        PageLabel.CAPTION,
+        PageLabel.CHECKBOX_UNSELECTED,
+        PageLabel.CHECKBOX_SELECTED,
+        PageLabel.SECTION_HEADER,
+        PageLabel.PAGE_HEADER,
+        PageLabel.PAGE_FOOTER,
+        PageLabel.CODE,
+        PageLabel.LIST_ITEM,
         # "Formula",
     ]
-    PAGE_HEADER_LABELS = ["Page-header", "Page-footer"]
+    PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER]
 
-    TABLE_LABEL = "Table"
-    FIGURE_LABEL = "Picture"
-    FORMULA_LABEL = "Formula"
+    TABLE_LABEL = PageLabel.TABLE
+    FIGURE_LABEL = PageLabel.PICTURE
+    FORMULA_LABEL = PageLabel.FORMULA
 
     def __init__(self, config):
         self.config = config
@@ -50,27 +51,27 @@ def __init__(self, config):
     def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
         MIN_INTERSECTION = 0.2
         CLASS_THRESHOLDS = {
-            "Caption": 0.35,
-            "Footnote": 0.35,
-            "Formula": 0.35,
-            "List-item": 0.35,
-            "Page-footer": 0.35,
-            "Page-header": 0.35,
-            "Picture": 0.2,  # low threshold adjust to capture chemical structures for examples.
-            "Section-header": 0.45,
-            "Table": 0.35,
-            "Text": 0.45,
-            "Title": 0.45,
-            "Document Index": 0.45,
-            "Code": 0.45,
-            "Checkbox-Selected": 0.45,
-            "Checkbox-Unselected": 0.45,
-            "Form": 0.45,
-            "Key-Value Region": 0.45,
+            PageLabel.CAPTION: 0.35,
+            PageLabel.FOOTNOTE: 0.35,
+            PageLabel.FORMULA: 0.35,
+            PageLabel.LIST_ITEM: 0.35,
+            PageLabel.PAGE_FOOTER: 0.35,
+            PageLabel.PAGE_HEADER: 0.35,
+            PageLabel.PICTURE: 0.2,  # low threshold, adjusted to capture e.g. chemical structures.
+            PageLabel.SECTION_HEADER: 0.45,
+            PageLabel.TABLE: 0.35,
+            PageLabel.TEXT: 0.45,
+            PageLabel.TITLE: 0.45,
+            PageLabel.DOCUMENT_INDEX: 0.45,
+            PageLabel.CODE: 0.45,
+            PageLabel.CHECKBOX_SELECTED: 0.45,
+            PageLabel.CHECKBOX_UNSELECTED: 0.45,
+            PageLabel.FORM: 0.45,
+            PageLabel.KEY_VALUE_REGION: 0.45,
         }
 
         CLASS_REMAPPINGS = {
-            "Document Index": "Table",
+            PageLabel.DOCUMENT_INDEX: PageLabel.TABLE,
         }
 
         _log.debug("================= Start postprocess function ====================")
@@ -257,7 +258,7 @@ def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height):
                     coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT
                 ).to_top_left_origin(page_height),
                 confidence=c["confidence"],
-                label=c["type"],
+                label=PageLabel(c["type"]),
                 cells=cluster_cells,
             )
             clusters_out_new.append(c_new)
@@ -270,9 +271,12 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
             for ix, pred_item in enumerate(
                 self.layout_predictor.predict(page.get_image(scale=1.0))
             ):
+                label = PageLabel(
+                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                )  # Temporary, until docling-ibm-models uses docling-core types
                 cluster = Cluster(
                     id=ix,
-                    label=pred_item["label"],
+                    label=label,
                     confidence=pred_item["confidence"],
                     bbox=BoundingBox.model_validate(pred_item),
                     cells=[],

diff --git a/docling/utils/layout_utils.py b/docling/utils/layout_utils.py
@@ -2,6 +2,7 @@
 import logging
 
 import networkx as nx
+from docling_core.types.experimental.labels import PageLabel
 
 logger = logging.getLogger("layout_utils")
 
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
             "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
         )
         logger.debug("  with cells: " + str(new_cluster["cell_ids"]))
-        if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
+        if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE:
             logger.debug("  Empty non-picture, removed")
             continue  ## Skip this former cluster, now without cells.
         new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
 
 
 def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
-    if not (cluster["type"] in ["Table", "Picture"]):
+    if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]):
         ## A text-like cluster. The bbox only needs to be around the text cells:
         logger.debug("    Initial bbox: " + str(cluster["bbox"]))
         new_bbox = surrounding_list(
             [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
         )
         logger.debug("  New bounding box:" + str(new_bbox))
-    if cluster["type"] == "Picture":
+    if cluster["type"] == PageLabel.PICTURE:
         ## We only make the bbox completely comprise included text cells:
         logger.debug("  Picture")
         if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
     max_id = -1
     figures = []
     for cluster in cluster_predictions:
-        if cluster["type"] == "Picture":
+        if cluster["type"] == PageLabel.PICTURE:
             figures.append(cluster)
 
         if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
             # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
             if fig_flag == False and lines_detector == False:
                 # get class from low confidence detections if not set as text:
-                class_type = "Text"
+                class_type = PageLabel.TEXT
 
                 for cluster in cluster_predictions_low:
                     intersection = compute_intersection(
                         orph_cell["bbox"], cluster["bbox"]
                     )
-                    class_type = "Text"
+                    class_type = PageLabel.TEXT
                     if (
                         cluster["confidence"] > 0.1
                         and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,7 @@ def merge_cells(cluster_predictions):
                     if cluster["id"] == node:
                         lines.append(cluster)
                         cluster_predictions.remove(cluster)
-            new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
+            new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id)
             cluster_predictions.append(new_merged_cluster)
     return cluster_predictions
 
@@ -753,9 +754,9 @@ def clean_up_clusters(
                 # remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
                 elif img_table == True:
                     if (
-                        cluster_1["type"] == "Text"
-                        and cluster_2["type"] == "Picture"
-                        or cluster_2["type"] == "Table"
+                        cluster_1["type"] == PageLabel.TEXT
+                        and cluster_2["type"] == PageLabel.PICTURE
+                        or cluster_2["type"] == PageLabel.TABLE
                     ):
                         if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +772,10 @@ def clean_up_clusters(
                             DuplicateDeletedClusterIDs.append(cluster_1["id"])
             # remove tables that have one pdf cell
             if one_cell_table == True:
-                if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
+                if (
+                    cluster_1["type"] == PageLabel.TABLE
+                    and len(cluster_1["cell_ids"]) < 2
+                ):
                     DuplicateDeletedClusterIDs.append(cluster_1["id"])
 
     DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -48,6 +48,14 @@ def export_documents(
                     )
                 )
 
+            # Export Docling document format to doctags (experimental):
+            with (output_dir / f"{doc_filename}.experimental.doctags").open("w") as fp:
+                fp.write(conv_res.experimental.export_to_document_tokens())
+
+            # Export Docling document format to markdown (experimental):
+            with (output_dir / f"{doc_filename}.experimental.md").open("w") as fp:
+                fp.write(conv_res.experimental.export_to_markdown())
+
             # Export Text format:
             with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                 fp.write(conv_res.render_as_text())

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -23,7 +23,7 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", branch = "cau/new-format-dev"}
+docling-core = {git = "ssh://[email protected]/DS4SD/docling-core.git", rev = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f"}
 docling-ibm-models = "^1.2.0"
 deepsearch-glm = {git = "ssh://[email protected]/DS4SD/deepsearch-glm.git", branch = "cau/new-format-dev"}