Skip to content

Commit

Permalink
Introduce provenance info, use enum labels
Browse files: browse the repository at this point in the history.
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 24, 2024
1 parent 40ef447 commit c26b52e
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 62 deletions.
45 changes: 40 additions & 5 deletions deepsearch_glm/utils/doc_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,12 @@
from typing import List

import pandas as pd
from docling_core.types.experimental.document import DoclingDocument, FileInfo, BaseFigureData, BaseTableData, TableCell
from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
from docling_core.types.experimental.document import DoclingDocument, FileInfo, BaseFigureData, BaseTableData, \
TableCell, ProvenanceItem, PageItem

from docling_core.types.experimental.labels import PageLabel


def resolve_item(paths, obj):
"""Find item in document from a reference path"""
Expand Down Expand Up @@ -121,7 +126,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
],
}
doc_glm["page-elements"].remove(nelem)
caption_obj = doc.add_paragraph(label="caption", text=text)

prov = ProvenanceItem(page_no=nelem["page"], charspan=tuple(nelem["span"]), bbox=BoundingBox.from_tuple(nelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

caption_obj = doc.add_paragraph(label=PageLabel.CAPTION, text=text, prov=prov)
caption_refs.append(caption_obj.get_ref())

figure = {
Expand All @@ -140,6 +148,9 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
],
}

prov = ProvenanceItem(page_no=pelem["page"], charspan=(0, len(text)),
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

fig = doc.add_figure(data=BaseFigureData())
fig.captions.extend(caption_refs)

Expand Down Expand Up @@ -175,7 +186,11 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
],
}
doc_glm["page-elements"].remove(nelem)
caption_obj = doc.add_paragraph(label="caption", text=text)

prov = ProvenanceItem(page_no=pelem["page"], charspan=nelem["span"],
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

caption_obj = doc.add_paragraph(label=PageLabel.CAPTION, text=text, prov=prov)
caption_refs.append(caption_obj.get_ref())


Expand Down Expand Up @@ -221,7 +236,11 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
row_section: bool = False
"""
tbl_data = BaseTableData(num_rows=obj.get("#-rows", 0), num_cols=obj.get("#-cols", 0), table_cells=table_cells)
tbl = doc.add_table(data=tbl_data)

prov = ProvenanceItem(page_no=pelem["page"], charspan=(0, 0),
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

tbl = doc.add_table(data=tbl_data, prov=prov)
tbl.captions.extend(caption_refs)

elif "text" in obj:
Expand All @@ -248,7 +267,10 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
}
],
}
doc.add_paragraph(label=name_label, text=text)
prov = ProvenanceItem(page_no=pelem["page"], charspan=(0, len(text)),
bbox=BoundingBox.from_tuple(pelem["bbox"], origin=CoordOrigin.BOTTOMLEFT))

doc.add_paragraph(label=PageLabel(name_label), text=text, prov=prov)

else:
pitem = {
Expand All @@ -258,6 +280,19 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
{"bbox": pelem["bbox"], "page": pelem["page"], "span": [0, 0]}
],
}
# This branch should not be reachable.

page_to_hash = {
item["page"]: item["hash"]
for item in doc_glm["file-info"]["page-hashes"]
}

for page_dim in doc_glm["page-dimensions"]:
page_no = int(page_dim["page"])
size = Size(width=page_dim["width"], height=page_dim["height"])
hash = page_to_hash[page_no]

pitem = doc.add_page(page_no=page_no, size=size, hash=hash)

return doc

Expand Down
Loading

0 comments on commit c26b52e

Please sign in to comment.