Skip to content

Commit

Permalink
feat: add Docling JSON ingestion (#783)
Browse files Browse the repository at this point in the history
* feat: add Docling JSON ingestion

Signed-off-by: Panos Vagenas <[email protected]>

* update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies

Signed-off-by: Panos Vagenas <[email protected]>

* Update docling/backend/json/docling_json_backend.py

Co-authored-by: Cesar Berrospi Ramis <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
vagenas and ceberam authored Jan 24, 2025
1 parent e9768ae commit 88a0e66
Show file tree
Hide file tree
Showing 8 changed files with 144 additions and 1 deletion.
1 change: 0 additions & 1 deletion docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def is_valid(self) -> bool:
def supports_pagination(cls) -> bool:
pass

@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
Expand Down
Empty file.
58 changes: 58 additions & 0 deletions docling/backend/json/docling_json_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from io import BytesIO
from pathlib import Path
from typing import Union

from docling_core.types.doc import DoclingDocument
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Backend that ingests a document already serialized as Docling JSON.

    The JSON payload is loaded and validated once, at construction time. The
    outcome is kept in ``_doc_or_err``: either the validated
    ``DoclingDocument`` or the exception raised while producing it.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialize the base backend, then eagerly try to load the document."""
        super().__init__(in_doc, path_or_stream)

        # convert() must be able to raise the real loading error later on, so
        # the attribute below holds exactly one of two things: the document
        # (success) or the exception (failure).
        self._doc_or_err = self._get_doc_or_err()

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Load and validate the input, returning the document or the error.

        Any exception raised while reading or validating is returned rather
        than propagated, so that the caller can store it.
        """
        try:
            if isinstance(self.path_or_stream, Path):
                payload: Union[str, bytes] = self.path_or_stream.read_text(
                    encoding="utf-8"
                )
            elif isinstance(self.path_or_stream, BytesIO):
                payload = self.path_or_stream.getvalue()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=payload)
        except Exception as err:
            return err

    @override
    def is_valid(self) -> bool:
        """Return True when loading produced a ``DoclingDocument``."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not support pagination."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.JSON_DOCLING}

    @override
    def convert(self) -> DoclingDocument:
        """Return the loaded document, or raise the error stored at init time."""
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class InputFormat(str, Enum):
MD = "md"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling"


class OutputFormat(str, Enum):
Expand All @@ -62,6 +63,7 @@ class OutputFormat(str, Enum):
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand Down Expand Up @@ -90,6 +92,7 @@ class OutputFormat(str, Enum):
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
}

MimeTypeToFormat: dict[str, list[InputFormat]] = {
Expand Down
2 changes: 2 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ def _mime_from_extension(ext):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
return mime

@staticmethod
Expand Down
4 changes: 4 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
Expand Down Expand Up @@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options
Expand Down
58 changes: 58 additions & 0 deletions tests/test_backend_docling_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Test methods in module docling.backend.json.docling_json_backend.py."""

from io import BytesIO
from pathlib import Path

import pytest
from pydantic import ValidationError

from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument

GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")


def test_convert_valid_docling_json():
    """A valid Docling JSON file is accepted and converts to the same content."""
    backend_cls = DoclingJSONBackend
    source = GT_PATH
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=source)
    assert backend.is_valid()

    # Compare plain-dict exports of the converted and the directly loaded document.
    actual = backend.convert().export_to_dict()
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected


def test_invalid_docling_json():
    """An empty JSON object is flagged invalid and fails on conversion."""
    backend_cls = DoclingJSONBackend
    stream = BytesIO(b"{}")
    input_doc = InputDocument(
        path_or_stream=stream,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
        filename="foo",
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=stream)

    assert not backend.is_valid()

    # The validation error captured at construction time surfaces from convert().
    with pytest.raises(ValidationError):
        backend.convert()
19 changes: 19 additions & 0 deletions tests/test_input_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) == None

# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING

# Non-Docling JSON
# TODO: Docling JSON is currently the only supported JSON flavor, so the pipeline
# will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
# disambiguation is expected as part of https://github.com/DS4SD/docling/issues/802
test_str = "{}"
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING


def _make_input_doc(path):
in_doc = InputDocument(
Expand Down

0 comments on commit 88a0e66

Please sign in to comment.