Skip to content

Commit

Permalink
feat: Add ContentLayer attribute to designate items to body or furnit…
Browse files Browse the repository at this point in the history
…ure (#148)

* feat: Add ContentLayer attribute to designate items to body or furniture

Signed-off-by: Christoph Auer <[email protected]>

* introduce safer data gen mechanism, update chunking test data

Signed-off-by: Panos Vagenas <[email protected]>

* Do not make test rely on order in yaml

Signed-off-by: Christoph Auer <[email protected]>

* chore: format fixes

Signed-off-by: Christoph Auer <[email protected]>

* fix: legacy_to_docling_doc must use content_layer

Signed-off-by: Christoph Auer <[email protected]>

* Add content_layer in iterate_items

Signed-off-by: Christoph Auer <[email protected]>

* Bump format version, add model_validator for old page_header,page_footer in body

Signed-off-by: Christoph Auer <[email protected]>

* fix: Change to before model_validator

Signed-off-by: Christoph Auer <[email protected]>

* Update tests

Signed-off-by: Christoph Auer <[email protected]>

* Address review comments

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Panos Vagenas <[email protected]>
  • Loading branch information
cau-git and vagenas authored Feb 10, 2025
1 parent 794c00d commit 786f0c6
Show file tree
Hide file tree
Showing 28 changed files with 848 additions and 1,147 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ pip install docling-core

To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
```bash
poetry install
poetry install --all-extras
```

To run the pytest suite, execute:
Expand Down
120 changes: 105 additions & 15 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import textwrap
import typing
import warnings
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
Expand Down Expand Up @@ -54,7 +55,7 @@

Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
CURRENT_VERSION: Final = "1.0.0"
CURRENT_VERSION: Final = "1.1.0"

DEFAULT_EXPORT_LABELS = {
DocItemLabel.TITLE,
Expand All @@ -70,6 +71,8 @@
DocItemLabel.LIST_ITEM,
DocItemLabel.CODE,
DocItemLabel.REFERENCE,
DocItemLabel.PAGE_HEADER,
DocItemLabel.PAGE_FOOTER,
}


Expand Down Expand Up @@ -513,13 +516,25 @@ class ProvenanceItem(BaseModel):
charspan: Tuple[int, int]


class ContentLayer(str, Enum):
    """Layer of the document that a node is assigned to.

    BODY is the layer for main document content. FURNITURE is the layer for
    content outside the body, such as page headers and page footers.
    """

    BODY = "body"
    FURNITURE = "furniture"


# Content layers selected when a caller does not specify any; used as the
# default value of the `included_content_layers` parameter of `iterate_items`.
# NOTE(review): this is a mutable set shared as a default argument value, so
# it should be treated as read-only by callers.
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}


class NodeItem(BaseModel):
"""NodeItem."""

self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
parent: Optional[RefItem] = None
children: List[RefItem] = []

content_layer: ContentLayer = ContentLayer.BODY

model_config = ConfigDict(extra="forbid")

def get_ref(self):
Expand Down Expand Up @@ -1442,8 +1457,8 @@ class DoclingDocument(BaseModel):
# generated from synthetic data.
)

furniture: GroupItem = GroupItem(
name="_root_", self_ref="#/furniture"
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
) # List[RefItem] = []
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

Expand All @@ -1455,11 +1470,28 @@ class DoclingDocument(BaseModel):

pages: Dict[int, PageItem] = {} # empty as default

@model_validator(mode="before")
@classmethod
def transform_to_content_layer(cls, data: Any) -> Any:
    """Upgrade raw version-1.0.0 input to carry ``content_layer`` on text items.

    Since version 1.1.0, all NodeItems carry a ``content_layer`` property.
    Input declaring version "1.0.0" predates that property, so every entry of
    ``texts`` labelled as page header or page footer is assigned to the
    furniture layer here.

    Note: model validators which check on the version must use "before",
    i.e. they operate on the raw input prior to field validation.

    Args:
        data: The raw, unvalidated input handed to the model. Only a dict
            whose "version" equals "1.0.0" is modified (in place); any other
            input is returned untouched so that regular validation can
            accept or reject it.

    Returns:
        The same ``data`` object that was passed in.
    """
    # A "before" validator can receive arbitrary objects, not only dicts;
    # leave anything that is not a 1.0.0 dict payload to regular validation.
    if not isinstance(data, dict) or data.get("version") != "1.0.0":
        return data

    texts = data.get("texts")
    if not isinstance(texts, (list, tuple)):
        # Missing, null or malformed "texts": nothing to migrate here, field
        # validation reports the problem if there is one.
        return data

    # Tuple (not set) so that an unhashable, malformed label cannot raise.
    furniture_labels = (
        DocItemLabel.PAGE_HEADER.value,
        DocItemLabel.PAGE_FOOTER.value,
    )
    for item in texts:
        # Entries that are not raw dicts cannot be patched by key assignment.
        if isinstance(item, dict) and item.get("label") in furniture_labels:
            item["content_layer"] = "furniture"
    return data

def add_group(
self,
label: Optional[GroupLabel] = None,
name: Optional[str] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
) -> GroupItem:
"""add_group.
Expand All @@ -1479,6 +1511,8 @@ def add_group(
group.name = name
if label is not None:
group.label = label
if content_layer:
group.content_layer = content_layer

self.groups.append(group)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1493,6 +1527,7 @@ def add_list_item(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_list_item.
Expand Down Expand Up @@ -1523,6 +1558,8 @@ def add_list_item(
)
if prov:
list_item.prov.append(prov)
if content_layer:
list_item.content_layer = content_layer

self.texts.append(list_item)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1536,6 +1573,7 @@ def add_text(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_text.
Expand All @@ -1549,16 +1587,40 @@ def add_text(
# Catch a few cases that are in principle allowed
# but that will create confusion down the road
if label in [DocItemLabel.TITLE]:
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
return self.add_title(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.LIST_ITEM]:
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
return self.add_list_item(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.SECTION_HEADER]:
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
return self.add_heading(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

elif label in [DocItemLabel.CODE]:
return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
return self.add_code(
text=text,
orig=orig,
prov=prov,
parent=parent,
content_layer=content_layer,
)

else:

Expand All @@ -1580,6 +1642,9 @@ def add_text(
if prov:
text_item.prov.append(prov)

if content_layer:
text_item.content_layer = content_layer

self.texts.append(text_item)
parent.children.append(RefItem(cref=cref))

Expand All @@ -1592,6 +1657,7 @@ def add_table(
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
label: DocItemLabel = DocItemLabel.TABLE,
content_layer: Optional[ContentLayer] = None,
):
"""add_table.
Expand All @@ -1613,6 +1679,9 @@ def add_table(
)
if prov:
tbl_item.prov.append(prov)
if content_layer:
tbl_item.content_layer = content_layer

if caption:
tbl_item.captions.append(caption.get_ref())

Expand All @@ -1628,6 +1697,7 @@ def add_picture(
caption: Optional[Union[TextItem, RefItem]] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_picture.
Expand All @@ -1652,6 +1722,8 @@ def add_picture(
)
if prov:
fig_item.prov.append(prov)
if content_layer:
fig_item.content_layer = content_layer
if caption:
fig_item.captions.append(caption.get_ref())

Expand All @@ -1666,6 +1738,7 @@ def add_title(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_title.
Expand All @@ -1691,6 +1764,8 @@ def add_title(
)
if prov:
text_item.prov.append(prov)
if content_layer:
text_item.content_layer = content_layer

self.texts.append(text_item)
parent.children.append(RefItem(cref=cref))
Expand All @@ -1704,6 +1779,7 @@ def add_code(
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_code.
Expand All @@ -1729,6 +1805,8 @@ def add_code(
)
if code_language:
code_item.code_language = code_language
if content_layer:
code_item.content_layer = content_layer
if prov:
code_item.prov.append(prov)

Expand All @@ -1744,6 +1822,7 @@ def add_heading(
level: LevelNumber = 1,
prov: Optional[ProvenanceItem] = None,
parent: Optional[NodeItem] = None,
content_layer: Optional[ContentLayer] = None,
):
"""add_heading.
Expand Down Expand Up @@ -1771,6 +1850,8 @@ def add_heading(
)
if prov:
section_header_item.prov.append(prov)
if content_layer:
section_header_item.content_layer = content_layer

self.texts.append(section_header_item)
parent.children.append(RefItem(cref=cref))
Expand Down Expand Up @@ -1798,6 +1879,7 @@ def iterate_items(
with_groups: bool = False,
traverse_pictures: bool = False,
page_no: Optional[int] = None,
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
_level: int = 0, # fixed parameter, carries through the node nesting level
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
"""iterate_elements.
Expand All @@ -1814,14 +1896,22 @@ def iterate_items(
root = self.body

# Yield non-group items or group items when with_groups=True
if not isinstance(root, GroupItem) or with_groups:
if isinstance(root, DocItem):
if page_no is None or any(
prov.page_no == page_no for prov in root.prov
):
yield root, _level
else:
yield root, _level

# Combine conditions to have a single yield point
should_yield = (
(not isinstance(root, GroupItem) or with_groups)
and (
not isinstance(root, DocItem)
or (
page_no is None
or any(prov.page_no == page_no for prov in root.prov)
)
)
and root.content_layer in included_content_layers
)

if should_yield:
yield root, _level

# Handle picture traversal - only traverse children if requested
if isinstance(root, PictureItem) and not traverse_pictures:
Expand Down
6 changes: 3 additions & 3 deletions docling_core/utils/legacy.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
TableItem,
TextItem,
)
from docling_core.types.doc.document import GroupItem, ListItem, TableData
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
from docling_core.types.doc.labels import GroupLabel
from docling_core.types.legacy_doc.base import (
BaseCell,
Expand Down Expand Up @@ -400,7 +400,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
doc.add_text(
label=DocItemLabel.PAGE_HEADER,
text=text_item.text,
parent=doc.furniture,
content_layer=ContentLayer.FURNITURE,
)

# page footers
Expand All @@ -412,7 +412,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
doc.add_text(
label=DocItemLabel.PAGE_FOOTER,
text=text_item.text,
parent=doc.furniture,
content_layer=ContentLayer.FURNITURE,
)

# footnotes
Expand Down
Loading

0 comments on commit 786f0c6

Please sign in to comment.