Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add content_layer property to items to address body, furniture and other roles #735

Merged
merged 10 commits into from
Feb 10, 2025
62 changes: 60 additions & 2 deletions docling/models/ds_glm_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
from typing import List, Union

from deepsearch_glm.andromeda_nlp import nlp_model
from docling_core.types.doc import BoundingBox, CoordOrigin, DoclingDocument
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItemLabel,
DoclingDocument,
)
from docling_core.types.legacy_doc.base import BoundingBox as DsBoundingBox
from docling_core.types.legacy_doc.base import (
Figure,
Expand Down Expand Up @@ -71,12 +76,15 @@ def _to_legacy_document(self, conv_res) -> DsDocument:
)

main_text: List[Union[Ref, BaseText]] = []
page_headers: List[Union[Ref, BaseText]] = []
page_footers: List[Union[Ref, BaseText]] = []

tables: List[DsSchemaTable] = []
figures: List[Figure] = []

page_no_to_page = {p.page_no: p for p in conv_res.pages}

for element in conv_res.assembled.elements:
for element in conv_res.assembled.body:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
Expand Down Expand Up @@ -238,6 +246,53 @@ def make_spans(cell):
)
)

# Headers and footers can simply be appended at the end of the legacy doc,
# since the reading-order will re-sort them later.
for element in conv_res.assembled.headers:
# Convert bboxes to lower-left origin.
target_bbox = DsBoundingBox(
element.cluster.bbox.to_bottom_left_origin(
page_no_to_page[element.page_no].size.height
).as_tuple()
)

if isinstance(element, TextElement):

tel = BaseText(
text=element.text,
obj_type=layout_label_to_ds_type.get(element.label),
name=element.label,
prov=[
Prov(
bbox=target_bbox,
page=element.page_no + 1,
span=[0, len(element.text)],
)
],
)
if element.label == DocItemLabel.PAGE_HEADER:
index = len(page_headers)
ref_str = f"#/page-headers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_headers.append(tel)
elif element.label == DocItemLabel.PAGE_FOOTER:
index = len(page_footers)
ref_str = f"#/page-footers/{index}"
main_text.append(
Ref(
name=element.label,
obj_type=layout_label_to_ds_type.get(element.label),
ref=ref_str,
),
)
page_footers.append(tel)

page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages
Expand All @@ -252,6 +307,8 @@ def make_spans(cell):
tables=tables,
figures=figures,
page_dimensions=page_dimensions,
page_headers=page_headers,
page_footers=page_footers,
)

return ds_doc
Expand All @@ -264,6 +321,7 @@ def __call__(self, conv_res: ConversionResult) -> DoclingDocument:
glm_doc = self.model.apply_on_doc(ds_doc_dict)

docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
1 == 1

# DEBUG code:
def draw_clusters_and_cells(ds_document, page_no, show: bool = False):
Expand Down
9 changes: 9 additions & 0 deletions docling/utils/glm_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
current_list = None

doc.add_heading(text=text, prov=prov)
elif label in [DocItemLabel.PAGE_HEADER, DocItemLabel.PAGE_FOOTER]:
current_list = None

doc.add_text(
label=DocItemLabel(name_label),
text=text,
prov=prov,
parent=doc.furniture,
)
else:
current_list = None

Expand Down
Loading