Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

feat: add Docling JSON ingestion #783

Merged
merged 3 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def is_valid(self) -> bool:
def supports_pagination(cls) -> bool:
pass

@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
Expand Down
Empty file.
46 changes: 46 additions & 0 deletions docling/backend/json/docling_json_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from io import BytesIO
from pathlib import Path
from typing import Union

from docling_core.types.doc import DoclingDocument
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


class DoclingJSONBackend(DeclarativeDocumentBackend):
    """Declarative backend that loads an already-serialized DoclingDocument.

    The input is JSON, supplied either as a file path or as an in-memory byte
    stream, and is turned back into a ``DoclingDocument`` by ``convert``.
    """

    @override
    def __init__(
        self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
    ) -> None:
        """Initialise the base backend and keep a reference to ``in_doc``.

        Args:
            in_doc: The input document descriptor this backend is bound to.
            path_or_stream: Where the JSON lives: a filesystem path or an
                in-memory byte stream.
        """
        super().__init__(in_doc, path_or_stream)
        self._my_in_doc = in_doc

    @override
    def is_valid(self) -> bool:
        """Always report the input as valid; no check is performed here."""
        return True

    @classmethod
    @override
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend reports no pagination support."""
        return False

    @classmethod
    @override
    def supported_formats(cls) -> set[InputFormat]:
        """Return the single format handled here: ``InputFormat.JSON_DOCLING``."""
        return {InputFormat.JSON_DOCLING}

    @override
    def convert(self) -> DoclingDocument:
        """Deserialize the JSON source into a ``DoclingDocument``.

        Returns:
            The document produced by ``DoclingDocument.model_validate_json``.

        Raises:
            RuntimeError: If ``self.path_or_stream`` is neither a ``Path`` nor
                a ``BytesIO``.
        """
        payload: Union[str, bytes]
        if isinstance(self.path_or_stream, Path):
            # Files on disk are decoded as UTF-8 text.
            payload = self.path_or_stream.read_text(encoding="utf-8")
        elif isinstance(self.path_or_stream, BytesIO):
            # getvalue() returns the whole buffer as raw bytes.
            payload = self.path_or_stream.getvalue()
        else:
            raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
        return DoclingDocument.model_validate_json(json_data=payload)
3 changes: 3 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class InputFormat(str, Enum):
MD = "md"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling"


class OutputFormat(str, Enum):
Expand All @@ -62,6 +63,7 @@ class OutputFormat(str, Enum):
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
}

FormatToMimeType: Dict[InputFormat, List[str]] = {
Expand Down Expand Up @@ -90,6 +92,7 @@ class OutputFormat(str, Enum):
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
}

MimeTypeToFormat: dict[str, list[InputFormat]] = {
Expand Down
11 changes: 10 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ def _guess_format(self, obj: Union[Path, DocumentStream]) -> Optional[InputForma
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
if formats:
if len(formats) == 1 and mime not in ("text/plain"):
if len(formats) == 1 and mime not in ("text/plain", "application/json"):
return formats[0]
else: # ambiguity in formats
return _DocumentConversionInput._guess_from_content(
Expand Down Expand Up @@ -339,6 +339,13 @@ def _guess_from_content(
if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
input_format = InputFormat.XML_USPTO

elif mime == "application/json":
if (
InputFormat.JSON_DOCLING in formats
and '"schema_name": "DoclingDocument"' in content_str
):
input_format = InputFormat.JSON_DOCLING

vagenas marked this conversation as resolved.
Show resolved Hide resolved
return input_format

@staticmethod
Expand All @@ -350,6 +357,8 @@ def _mime_from_extension(ext):
mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
return mime

@staticmethod
Expand Down
4 changes: 4 additions & 0 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
Expand Down Expand Up @@ -136,6 +137,9 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV2DocumentBackend
),
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
}
if (options := format_to_default_options.get(format)) is not None:
return options
Expand Down
Loading