From 88a0e66adc19238f57a942b0504926cdaeacd8cc Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 24 Jan 2025 18:05:23 +0100 Subject: [PATCH] feat: add Docling JSON ingestion (#783) * feat: add Docling JSON ingestion Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * update conversion as per review comments, add tests, revert Docling JSON disambiguation, document intricacies Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * Update docling/backend/json/docling_json_backend.py Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/abstract_backend.py | 1 - docling/backend/json/__init__.py | 0 docling/backend/json/docling_json_backend.py | 58 ++++++++++++++++++++ docling/datamodel/base_models.py | 3 + docling/datamodel/document.py | 2 + docling/document_converter.py | 4 ++ tests/test_backend_docling_json.py | 58 ++++++++++++++++++++ tests/test_input_doc.py | 19 +++++++ 8 files changed, 144 insertions(+), 1 deletion(-) create mode 100644 docling/backend/json/__init__.py create mode 100644 docling/backend/json/docling_json_backend.py create mode 100644 tests/test_backend_docling_json.py diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index b47b11cd..491330b3 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -27,7 +27,6 @@ def is_valid(self) -> bool: def supports_pagination(cls) -> bool: pass - @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() diff --git a/docling/backend/json/__init__.py b/docling/backend/json/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/backend/json/docling_json_backend.py b/docling/backend/json/docling_json_backend.py new file mode 100644 index 00000000..73ac6972 --- /dev/null +++ b/docling/backend/json/docling_json_backend.py @@ -0,0 +1,58 @@ +from io import BytesIO +from pathlib import Path +from typing import Union + +from docling_core.types.doc import DoclingDocument +from typing_extensions import override + +from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import InputDocument + + +class DoclingJSONBackend(DeclarativeDocumentBackend): + @override + def __init__( + self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path] + ) -> None: + super().__init__(in_doc, path_or_stream) + + # given we need to store any actual conversion exception for raising it from + # convert(), this captures the successful result or the actual error in a + # mutually exclusive way: + self._doc_or_err = self._get_doc_or_err() + + @override + def is_valid(self) -> bool: + return isinstance(self._doc_or_err, DoclingDocument) + + @classmethod + @override + def supports_pagination(cls) -> bool: + return False + + @classmethod + @override + def supported_formats(cls) -> set[InputFormat]: + return {InputFormat.JSON_DOCLING} + + def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]: + try: + json_data: Union[str, bytes] + if isinstance(self.path_or_stream, Path): + with open(self.path_or_stream, encoding="utf-8") as f: + json_data = f.read() + elif isinstance(self.path_or_stream, BytesIO): + json_data = self.path_or_stream.getvalue() + else: + raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}") + return DoclingDocument.model_validate_json(json_data=json_data) + except Exception as e: + return e + + @override + def convert(self) -> DoclingDocument: + if isinstance(self._doc_or_err, DoclingDocument): + return self._doc_or_err + else: + raise self._doc_or_err diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 99d30108..d1e7ce3a 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -41,6 +41,7 @@ class InputFormat(str, Enum): MD = "md" XLSX = "xlsx" XML_USPTO = "xml_uspto" + JSON_DOCLING = "json_docling" class OutputFormat(str, Enum): @@ -62,6 +63,7 @@ class OutputFormat(str, Enum): InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.XLSX: ["xlsx"], InputFormat.XML_USPTO: ["xml", "txt"], + InputFormat.JSON_DOCLING: ["json"], } FormatToMimeType: Dict[InputFormat, List[str]] = { @@ -90,6 +92,7 @@ class OutputFormat(str, Enum): "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" ], InputFormat.XML_USPTO: ["application/xml", "text/plain"], + InputFormat.JSON_DOCLING: ["application/json"], } MimeTypeToFormat: dict[str, list[InputFormat]] = { diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 136428e8..a2a93aa3 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -350,6 +350,8 @@ def _mime_from_extension(ext): mime = FormatToMimeType[InputFormat.HTML][0] elif ext in FormatToExtensions[InputFormat.MD]: mime = FormatToMimeType[InputFormat.MD][0] + elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]: + mime = FormatToMimeType[InputFormat.JSON_DOCLING][0] return mime @staticmethod diff --git a/docling/document_converter.py b/docling/document_converter.py index cb073949..13203ea7 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -11,6 +11,7 @@ from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.html_backend import HTMLDocumentBackend +from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend @@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.PDF: FormatOption( pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend ), + InputFormat.JSON_DOCLING: FormatOption( + pipeline_cls=SimplePipeline, backend=DoclingJSONBackend + ), } if (options := format_to_default_options.get(format)) is not None: return options diff --git a/tests/test_backend_docling_json.py b/tests/test_backend_docling_json.py new file mode 100644 index 00000000..a38d9da1 --- /dev/null +++ b/tests/test_backend_docling_json.py @@ -0,0 +1,58 @@ +"""Test methods in module docling.backend.json.docling_json_backend.py.""" + +from io import BytesIO +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from docling.backend.json.docling_json_backend import DoclingJSONBackend +from docling.datamodel.base_models import InputFormat +from docling.datamodel.document import DoclingDocument, InputDocument + +GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json") + + +def test_convert_valid_docling_json(): + """Test ingestion of valid Docling JSON.""" + cls = DoclingJSONBackend + path_or_stream = GT_PATH + in_doc = InputDocument( + path_or_stream=path_or_stream, + format=InputFormat.JSON_DOCLING, + backend=cls, + ) + backend = cls( + in_doc=in_doc, + path_or_stream=path_or_stream, + ) + assert backend.is_valid() + + act_doc = backend.convert() + act_data = act_doc.export_to_dict() + + exp_doc = DoclingDocument.load_from_json(GT_PATH) + exp_data = exp_doc.export_to_dict() + + assert act_data == exp_data + + +def test_invalid_docling_json(): + """Test ingestion of invalid Docling JSON.""" + cls = DoclingJSONBackend + path_or_stream = BytesIO(b"{}") + in_doc = InputDocument( + path_or_stream=path_or_stream, + format=InputFormat.JSON_DOCLING, + backend=cls, + filename="foo", + ) + backend = cls( + in_doc=in_doc, + path_or_stream=path_or_stream, + ) + + assert not backend.is_valid() + + with pytest.raises(ValidationError): + backend.convert() diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 8b084667..f6c516aa 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -124,6 +124,25 @@ def test_guess_format(tmp_path): doc_path.write_text("xyz", encoding="utf-8") assert dci._guess_format(doc_path) == None + # Valid Docling JSON + test_str = '{"name": ""}' + stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode())) + assert dci._guess_format(stream) == InputFormat.JSON_DOCLING + doc_path = temp_dir / "test.json" + doc_path.write_text(test_str, encoding="utf-8") + assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING + + # Non-Docling JSON + # TODO: Docling JSON is currently the single supported JSON flavor and the pipeline + # will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper + # disambiguation seen as part of https://github.com/DS4SD/docling/issues/802 + test_str = "{}" + stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode())) + assert dci._guess_format(stream) == InputFormat.JSON_DOCLING + doc_path = temp_dir / "test.json" + doc_path.write_text(test_str, encoding="utf-8") + assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING + def _make_input_doc(path): in_doc = InputDocument(