Skip to content

Commit

Permalink
Put stub for experimental format export
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git committed Sep 20, 2024
1 parent abb6ddd commit ac51a09
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 71 deletions.
5 changes: 4 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

-from docling_core.types import BaseCell, BaseText
+from docling_core.types import BaseCell, BaseText, DoclingDocument
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
Expand Down Expand Up @@ -60,6 +60,8 @@
),
)

+_EMPTY_DOCLING_DOC = DoclingDocument(description={}, file_info={}) # TODO: Stub
+

class InputDocument(BaseModel):
file: PurePath = None
Expand Down Expand Up @@ -137,6 +139,7 @@ class ConvertedDocument(BaseModel):
assembled: AssembledUnit = AssembledUnit()

output: DsDocument = _EMPTY_DOC
+experimental: DoclingDocument = _EMPTY_DOCLING_DOC

def _to_ds_document(self) -> DsDocument:
title = ""
Expand Down
2 changes: 1 addition & 1 deletion docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,4 +289,4 @@ def _assemble_doc(self, conv_res: ConversionResult):
elements=all_elements, headers=all_headers, body=all_body
)

-conv_res.output = self.glm_model(conv_res)
+conv_res.output, conv_res.experimental = self.glm_model(conv_res)
15 changes: 11 additions & 4 deletions docling/models/ds_glm_model.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import copy
import random
+from typing import Tuple

from deepsearch_glm.nlp_utils import init_nlp_model
-from deepsearch_glm.utils.doc_utils import to_legacy_document_format
+from deepsearch_glm.utils.doc_utils import (
+to_docling_document,
+to_legacy_document_format,
+)
from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
-from docling_core.types import BaseText
+from docling_core.types import BaseText, DoclingDocument
from docling_core.types import Document as DsDocument
from docling_core.types import Ref
from docling_core.types.experimental.base import BoundingBox, CoordOrigin
Expand All @@ -25,7 +29,9 @@ def __init__(self, config):
model = init_nlp_model(model_names=self.model_names)
self.model = model

-def __call__(self, conv_res: ConversionResult) -> DsDocument:
+def __call__(
+self, conv_res: ConversionResult
+) -> Tuple[DsDocument, DoclingDocument]:
ds_doc = conv_res._to_ds_document()
ds_doc_dict = ds_doc.model_dump(by_alias=True)

Expand All @@ -34,6 +40,7 @@ def __call__(self, conv_res: ConversionResult) -> DsDocument:
glm_doc, ds_doc_dict, update_name_label=True
)

+docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
exported_doc = DsDocument.model_validate(ds_doc_dict)

# DEBUG code:
Expand Down Expand Up @@ -84,4 +91,4 @@ def draw_clusters_and_cells(ds_document, page_no):
# draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0)

-return exported_doc
+return (exported_doc, docling_doc)
10 changes: 10 additions & 0 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from pathlib import Path
from typing import Iterable

+import yaml
+
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
Expand All @@ -30,6 +32,14 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict()))

+# Export Docling document format to YAML (experimental):
+with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
+fp.write(
+yaml.safe_dump(
+conv_res.experimental.model_dump(mode="json", by_alias=True)
+)
+)
+
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())
Expand Down
Loading

0 comments on commit ac51a09

Please sign in to comment.