Skip to content

Commit

Permalink
update conversion as per review comments, add tests, revert Docling J…
Browse files Browse the repository at this point in the history
…SON disambiguation, document intricacies

Signed-off-by: Panos Vagenas <[email protected]>
  • Loading branch information
vagenas committed Jan 24, 2025
1 parent e972c9c commit 9cdb176
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 20 deletions.
36 changes: 24 additions & 12 deletions docling/backend/json/docling_json_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,15 @@ def __init__(
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)
self._my_in_doc = in_doc

# convert() must raise the actual conversion exception, so we store either the
# successfully parsed document or the error that occurred; the two outcomes are
# mutually exclusive:
self._doc_or_err = self._get_doc_or_err()

@override
def is_valid(self) -> bool:
return True
return isinstance(self._doc_or_err, DoclingDocument)

@classmethod
@override
Expand All @@ -32,15 +36,23 @@ def supports_pagination(cls) -> bool:
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.JSON_DOCLING}

def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
    """Parse the input as a Docling document, capturing any failure.

    Reads the JSON payload from ``self.path_or_stream`` (a ``Path`` is read as
    UTF-8 text, a ``BytesIO`` yields its raw bytes) and validates it with
    ``DoclingDocument.model_validate_json``.

    Returns:
        The validated ``DoclingDocument`` on success; otherwise the exception
        instance that was raised along the way. The exception is returned,
        not raised, so the caller can store it and decide when to surface it.
    """
    try:
        source = self.path_or_stream
        payload: Union[str, bytes]
        if isinstance(source, BytesIO):
            payload = source.getvalue()
        elif isinstance(source, Path):
            payload = source.read_text(encoding="utf-8")
        else:
            # Neither supported input kind: report the offending type.
            raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
        return DoclingDocument.model_validate_json(json_data=payload)
    except Exception as err:
        # Deliberately broad: every failure (I/O, unexpected type, schema
        # validation) is handed back to the caller as a value.
        return err

@override
def convert(self) -> DoclingDocument:
json_data: Union[str, bytes]
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, encoding="utf-8") as f:
json_data = f.read()
elif isinstance(self.path_or_stream, BytesIO):
json_data = self.path_or_stream.getvalue()
def convert(self):
if isinstance(self._doc_or_err, DoclingDocument):
return self._doc_or_err
else:
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
doc = DoclingDocument.model_validate_json(json_data=json_data)
return doc
raise self._doc_or_err
9 changes: 1 addition & 8 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
if len(formats) == 1 and mime not in ("text/plain", "application/json"):
if len(formats) == 1 and mime not in ("text/plain"):
return formats[0]
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
Expand Down Expand Up @@ -339,13 +339,6 @@ def _guess_from_content(
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO

elif mime == "application/json":
if (
InputFormat.JSON_DOCLING in formats
and '"schema_name": "DoclingDocument"' in content_str
):
input_format = InputFormat.JSON_DOCLING

return input_format

@staticmethod
Expand Down
58 changes: 58 additions & 0 deletions tests/test_backend_docling_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
"""Test methods in module docling.backend.json.docling_json_backend.py."""

from io import BytesIO
from pathlib import Path

import pytest
from pydantic import ValidationError

from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DoclingDocument, InputDocument

GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/2206.01062.json")


def test_convert_valid_docling_json():
    """Check that a valid Docling JSON file is accepted and converted faithfully.

    The backend is built on the ground-truth file; it must report itself as
    valid, and the converted document must export to the same dict as the
    document loaded directly from that file.
    """
    backend_cls = DoclingJSONBackend
    source = GT_PATH
    input_doc = InputDocument(
        path_or_stream=source,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=source)
    assert backend.is_valid()

    # Document as produced by the backend under test.
    actual = backend.convert().export_to_dict()

    # Reference document loaded straight from the ground-truth file.
    expected = DoclingDocument.load_from_json(GT_PATH).export_to_dict()

    assert actual == expected


def test_invalid_docling_json():
    """Check that JSON which is not a Docling document is rejected.

    An empty JSON object is fed in as a byte stream; the backend must report
    itself as invalid, and ``convert()`` must raise the pydantic
    ``ValidationError``.
    """
    backend_cls = DoclingJSONBackend
    stream = BytesIO(b"{}")
    input_doc = InputDocument(
        path_or_stream=stream,
        format=InputFormat.JSON_DOCLING,
        backend=backend_cls,
        filename="foo",
    )
    backend = backend_cls(in_doc=input_doc, path_or_stream=stream)

    assert not backend.is_valid()

    # The stored validation failure must surface when conversion is requested.
    with pytest.raises(ValidationError):
        backend.convert()
19 changes: 19 additions & 0 deletions tests/test_input_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,25 @@ def test_guess_format(tmp_path):
doc_path.write_text("xyz", encoding="utf-8")
assert dci._guess_format(doc_path) == None

# Valid Docling JSON
test_str = '{"name": ""}'
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING

# Non-Docling JSON
# TODO: Docling JSON is currently the only supported JSON flavor, so the pipeline
# will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper
# disambiguation is to be addressed as part of https://github.com/DS4SD/docling/issues/802
test_str = "{}"
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
assert dci._guess_format(stream) == InputFormat.JSON_DOCLING
doc_path = temp_dir / "test.json"
doc_path.write_text(test_str, encoding="utf-8")
assert dci._guess_format(doc_path) == InputFormat.JSON_DOCLING


def _make_input_doc(path):
in_doc = InputDocument(
Expand Down

0 comments on commit 9cdb176

Please sign in to comment.