Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add ContentLayer attribute to designate items to body or furniture #148

Merged
merged 10 commits into from
Feb 10, 2025
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pip install docling-core

To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
```bash
poetry install
poetry install --all-extras
```

To run the pytest suite, execute:
Expand Down
120 changes: 105 additions & 15 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import textwrap
import typing
import warnings
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
Expand Down Expand Up @@ -46,7 +47,7 @@

Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.0.0"
CURRENT_VERSION: Final = "1.1.0"

DEFAULT_EXPORT_LABELS = {
DocItemLabel.TITLE,
Expand All @@ -62,6 +63,8 @@
DocItemLabel.LIST_ITEM,
DocItemLabel.CODE,
DocItemLabel.REFERENCE,
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
}


Expand Down Expand Up @@ -505,13 +508,25 @@ class ProvenanceItem(BaseModel):
charspan: Tuple[int, int]


class ContentLayer(str, Enum):
    """Layer of a document that a node is assigned to.

    The enum mixes in ``str``, so every member compares equal to its plain
    string value (e.g. ``ContentLayer.BODY == "body"``).
    """

    # Default layer for a NodeItem when none is given explicitly.
    BODY = "body"
    # Layer assigned to page headers and page footers.
    FURNITURE = "furniture"


# Content layers included when a caller does not supply its own set; used as
# the default value of the ``included_content_layers`` parameter of
# ``iterate_items``. Only the body layer is included by default.
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}


class NodeItem(BaseModel):
"""NodeItem."""

self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
parent: Optional[RefItem] = None
children: List[RefItem] = []

content_layer: ContentLayer = ContentLayer.BODY

model_config = ConfigDict(extra="forbid")

def get_ref(self):
Expand Down Expand Up @@ -1419,8 +1434,8 @@ class DoclingDocument(BaseModel):
# generated from synthetic data.
)

furniture: GroupItem = GroupItem(
name="_root_", self_ref="#/furniture"
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
) # List[RefItem] = []
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

Expand All @@ -1432,11 +1447,28 @@ class DoclingDocument(BaseModel):

pages: Dict[int, PageItem] = {} # empty as default

@model_validator(mode="before")
@classmethod
def transform_to_content_layer(cls, data: dict) -> dict:
"""transform_to_content_layer."""
# Since version 1.1.0, all NodeItems carry content_layer property.
# We must assign previous page_header and page_footer instances to furniture.
# Note: model_validators which check on the version must use "before".
if "version" in data and data["version"] == "1.0.0":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have logic in docling-core for checking version numbers: https://github.com/DS4SD/docling-core/blob/main/docling_core/types/doc/document.py#L2947

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but why do we need that logic? This is not about determining whether the version is compatible; it specifically checks whether the version is 1.0.0 (the only version in existence prior to the current version).

for item in data.get("texts", []):
if "label" in item and item["label"] in [
DocItemLabel.PAGE_HEADER.value,
DocItemLabel.PAGE_FOOTER.value,
]:
item["content_layer"] = "furniture"
return data

def add_group(
self,
label: Optional[GroupLabel] = None,
name: Optional[str] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
) -> GroupItem:
"""add_group.

Expand All @@ -1456,6 +1488,8 @@ def add_group(
group.name = name
if label is not None:
group.label = label
if content_layer:
group.content_layer = content_layer

self.groups.append(group)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1470,6 +1504,7 @@ def add_list_item(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_list_item.

Expand Down Expand Up @@ -1500,6 +1535,8 @@ def add_list_item(
)
if prov:
list_item.prov.append(prov)
if content_layer:
list_item.content_layer = content_layer

self.texts.append(list_item)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1513,6 +1550,7 @@ def add_text(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_text.

Expand All @@ -1526,16 +1564,40 @@ def add_text(
# Catch a few cases that are in principle allowed
# but that will create confusion down the road
if label in [DocItemLabel.TITLE]:
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
return self.add_title(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.LIST_ITEM]:
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
return self.add_list_item(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.SECTION_HEADER]:
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
return self.add_heading(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.CODE]:
return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
return self.add_code(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

else:

Expand All @@ -1557,6 +1619,9 @@ def add_text(
if prov:
text_item.prov.append(prov)

if content_layer:
text_item.content_layer = content_layer

self.texts.append(text_item)
parent.children.append(RefItem(cref=cref))

Expand All @@ -1569,6 +1634,7 @@ def add_table(
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
label: DocItemLabel = DocItemLabel.TABLE,
content_layer: Optional[ContentLayer] = None,
):
"""add_table.

Expand All @@ -1590,6 +1656,9 @@ def add_table(
)
if prov:
tbl_item.prov.append(prov)
if content_layer:
tbl_item.content_layer = content_layer

if caption:
tbl_item.captions.append(caption.get_ref())

Expand All @@ -1605,6 +1674,7 @@ def add_picture(
caption: Optional[Union[TextItem, RefItem]] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_picture.

Expand All @@ -1629,6 +1699,8 @@ def add_picture(
)
if prov:
fig_item.prov.append(prov)
if content_layer:
fig_item.content_layer = content_layer
if caption:
fig_item.captions.append(caption.get_ref())

Expand All @@ -1643,6 +1715,7 @@ def add_title(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_title.

Expand All @@ -1668,6 +1741,8 @@ def add_title(
)
if prov:
text_item.prov.append(prov)
if content_layer:
text_item.content_layer = content_layer

self.texts.append(text_item)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1681,6 +1756,7 @@ def add_code(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_code.

Expand All @@ -1706,6 +1782,8 @@ def add_code(
)
if code_language:
code_item.code_language = code_language
if content_layer:
code_item.content_layer = content_layer
if prov:
code_item.prov.append(prov)

Expand All @@ -1721,6 +1799,7 @@ def add_heading(
level: LevelNumber = 1,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_heading.

Expand Down Expand Up @@ -1748,6 +1827,8 @@ def add_heading(
)
if prov:
section_header_item.prov.append(prov)
if content_layer:
section_header_item.content_layer = content_layer

self.texts.append(section_header_item)
parent.children.append(RefItem(cref=cref))
Expand Down Expand Up @@ -1775,6 +1856,7 @@ def iterate_items(
with_groups: bool = False,
traverse_pictures: bool = False,
page_no: Optional[int] = None,
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
_level: int = 0, # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
"""iterate_elements.
Expand All @@ -1791,14 +1873,22 @@ def iterate_items(
root = self.body

# Yield non-group items or group items when with_groups=True
if not isinstance(root, GroupItem) or with_groups:
if isinstance(root, DocItem):
if page_no is None or any(
prov.page_no == page_no for prov in root.prov
):
yield root, _level
else:
yield root, _level

# Combine conditions to have a single yield point
should_yield = (
(not isinstance(root, GroupItem) or with_groups)
and (
not isinstance(root, DocItem)
or (
page_no is None
or any(prov.page_no == page_no for prov in root.prov)
)
)
and root.content_layer in included_content_layers
)

if should_yield:
yield root, _level

# Handle picture traversal - only traverse children if requested
if isinstance(root, PictureItem) and not traverse_pictures:
Expand Down
6 changes: 3 additions & 3 deletions docling_core/utils/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
TableItem,
TextItem,
)
from docling_core.types.doc.document import GroupItem, ListItem, TableData
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
from docling_core.types.doc.labels import GroupLabel
from docling_core.types.legacy_doc.base import (
BaseCell,
Expand Down Expand Up @@ -400,7 +400,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
doc.add_text(
label=DocItemLabel.PAGE_HEADER,
text=text_item.text,
parent=doc.furniture,
content_layer=ContentLayer.FURNITURE,
)

# page footers
Expand All @@ -412,7 +412,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
doc.add_text(
label=DocItemLabel.PAGE_FOOTER,
text=text_item.text,
parent=doc.furniture,
content_layer=ContentLayer.FURNITURE,
)

# footnotes
Expand Down
Loading