Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add Docling JSON ingestion #783

Merged
merged 3 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def is_valid(self) -> bool:
def supports_pagination(cls) -> bool:
pass

@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
Expand Down
Empty file.
58 changes: 58 additions & 0 deletions docling/backend/json/docling_json_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from io import BytesIO
from pathlib import Path
from typing import Union

from docling_core.types.doc import DoclingDocument
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Declarative backend that ingests an already-serialized Docling JSON document.

    The input is parsed once, in the constructor. The outcome of that parse --
    either the resulting ``DoclingDocument`` or the exception that was raised --
    is kept on the instance and later reported by ``is_valid()`` and
    ``convert()``.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialize the base backend and eagerly parse the input.

        Args:
            in_doc: The input document this backend is attached to.
            path_or_stream: File path or in-memory byte stream holding the JSON.
        """
        super().__init__(in_doc, path_or_stream)

        # Parsing happens here, but a failure has to be raised from convert().
        # A single attribute therefore holds either the parsed document or the
        # exception, never both.
        self._doc_or_err = self._get_doc_or_err()

    @override
    def is_valid(self) -> bool:
        """Return True when the input was parsed into a ``DoclingDocument``."""
        return isinstance(self._doc_or_err, DoclingDocument)

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return False: this backend reports no pagination support."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.JSON_DOCLING}

    def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
        """Parse the input and return the document, or the exception on failure.

        Any exception raised while reading or validating is returned rather
        than propagated, so the caller can store it for later.
        """
        try:
            source = self.path_or_stream
            raw: Union[str, bytes]
            if isinstance(source, Path):
                with open(source, encoding="utf-8") as fp:
                    raw = fp.read()
            elif isinstance(source, BytesIO):
                raw = source.getvalue()
            else:
                raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
            return DoclingDocument.model_validate_json(json_data=raw)
        except Exception as exc:
            return exc

    @override
    def convert(self) -> DoclingDocument:
        """Return the parsed document.

        Raises:
            Exception: The exception captured during parsing, if parsing failed.
        """
        outcome = self._doc_or_err
        if not isinstance(outcome, DoclingDocument):
            raise outcome
        return outcome
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class InputFormat(str, Enum):
MD = "md"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling"


class OutputFormat(str, Enum):
Expand All @@ -62,6 +63,7 @@ class OutputFormat(str, Enum):
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand Down Expand Up @@ -90,6 +92,7 @@ class OutputFormat(str, Enum):
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
}

MimeTypeToFormat: dict[str, list[InputFormat]] = {
Expand Down
2 changes: 2 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,8 @@ def _mime_from_extension(ext):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
return mime

@staticmethod
Expand Down
4 changes: 4 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
Expand Down Expand Up @@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options
Expand Down
58 changes: 58 additions & 0 deletions tests/test_backend_docling_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Test methods in module docling.backend.json.docling_json_backend.py."""

from io import BytesIO
from pathlib import Path

import pytest
from pydantic import ValidationError

from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument

GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")


def test_convert_valid_docling_json():
    """Check that a valid Docling JSON file is ingested unchanged."""
    source = GT_PATH
    backend_cls = DoclingJSONBackend
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=source)
    assert backend.is_valid()

    # The converted document must export to the same dict as the ground-truth
    # file loaded directly through DoclingDocument.
    actual = backend.convert().export_to_dict()
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected


def test_invalid_docling_json():
    """Check that invalid Docling JSON is flagged and fails on conversion."""
    backend_cls = DoclingJSONBackend
    # An empty JSON object, supplied as an in-memory stream.
    payload = BytesIO(b"{}")
    input_doc = InputDocument(
        path_or_stream=payload,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
        filename="foo",
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=payload)

    assert not backend.is_valid()

    # The validation error must be raised when convert() is called.
    with pytest.raises(ValidationError):
        backend.convert()
19 changes: 19 additions & 0 deletions tests/test_input_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) == None

# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING

# Non-Docling JSON
# TODO: Docling JSON is currently the only supported JSON flavor, so the pipeline
# will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
# disambiguation is tracked in https://github.com/DS4SD/docling/issues/802
test_str = "{}"
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING


def _make_input_doc(path):
in_doc = InputDocument(
Expand Down
Loading