Skip to content

Commit

Permalink
fix: improve handling of disallowed formats (#429)
Browse files Browse the repository at this point in the history
* fix: Fixes and tests for StopIteration on .convert()

Signed-off-by: Christoph Auer <[email protected]>

* fix: Remove unnecessary case handling

Signed-off-by: Christoph Auer <[email protected]>

* fix: Other test fixes

Signed-off-by: Christoph Auer <[email protected]>

* improve handling of unsupported types

- Introduced new explicit exception types instead of `RuntimeError`
- Introduced new `ConversionStatus` value for unsupported formats
- Tidied up converter member typing & removed asserts

Signed-off-by: Panos Vagenas <[email protected]>

* robustify & simplify format option resolution

Signed-off-by: Panos Vagenas <[email protected]>

* rename new status, populate ConversionResult errors

Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Panos Vagenas <[email protected]>
  • Loading branch information
cau-git and vagenas authored Dec 3, 2024
1 parent 2254845 commit 34c7c79
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 94 deletions.
2 changes: 2 additions & 0 deletions docling/datamodel/base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
FAILURE = auto()
SUCCESS = auto()
PARTIAL_SUCCESS = auto()
SKIPPED = auto()


class InputFormat(str, Enum):
Expand Down Expand Up @@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum):
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
USER_INPUT = auto()


class ErrorItem(BaseModel):
Expand Down
34 changes: 24 additions & 10 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union

import filetype
from docling_core.types.doc import (
Expand Down Expand Up @@ -164,12 +164,6 @@ def _init_doc(
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)

self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid():
self.valid = False
Expand Down Expand Up @@ -450,6 +444,25 @@ def make_spans(cell):
return ds_doc


class _DummyBackend(AbstractDocumentBackend):
    """Placeholder backend that never yields a usable document.

    It always reports itself as invalid, declares no supported formats and
    no pagination support. It is meant to stand in where a backend class is
    required but no real backend applies to the input.
    """

    def __init__(self, *args, **kwargs):
        # Pure pass-through to the base constructor. Kept explicit on
        # purpose: presumably the base class requires subclasses to define
        # their own ``__init__`` -- TODO confirm before removing.
        super().__init__(*args, **kwargs)

    def is_valid(self) -> bool:
        """Return ``False`` unconditionally: there is nothing to convert."""
        return False

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return an empty set: this backend handles no input format."""
        return set()

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return ``False``: this backend exposes no pages."""
        return False

    def unload(self):
        """Delegate resource release to the base class implementation."""
        return super().unload()


class _DocumentConversionInput(BaseModel):

path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
Expand All @@ -461,11 +474,12 @@ def docs(
for item in self.path_or_stream_iterator:
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
_log.info(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
_log.error(
f"Input document {obj.name} does not match any allowed format."
)
continue
backend = _DummyBackend
else:
backend = format_options[format].backend

Expand Down
186 changes: 103 additions & 83 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,21 @@
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
DocumentStream,
ErrorItem,
InputFormat,
)
from docling.datamodel.document import (
ConversionResult,
InputDocument,
_DocumentConversionInput,
)
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import DocumentLimits, settings
from docling.exceptions import ConversionError
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
Expand Down Expand Up @@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend


_format_to_default_options = {
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
),
InputFormat.PPTX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.MD: FormatOption(
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
),
InputFormat.ASCIIDOC: FormatOption(
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
),
}
def _get_default_option(format: InputFormat) -> FormatOption:
    """Return a freshly built default ``FormatOption`` for ``format``.

    The lookup table is constructed inside the function, so every call
    produces new ``FormatOption`` instances rather than handing out shared
    module-level objects.

    Args:
        format: The input format to resolve default options for.

    Returns:
        The default pipeline/backend pairing for ``format``.

    Raises:
        RuntimeError: If no default is configured for ``format``.
    """
    format_to_default_options = {
        InputFormat.XLSX: FormatOption(
            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
        ),
        InputFormat.DOCX: FormatOption(
            pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
        ),
        InputFormat.PPTX: FormatOption(
            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
        ),
        InputFormat.MD: FormatOption(
            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
        ),
        InputFormat.ASCIIDOC: FormatOption(
            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
        ),
        InputFormat.HTML: FormatOption(
            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
        ),
        InputFormat.IMAGE: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
        ),
        InputFormat.PDF: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
        ),
    }
    options = format_to_default_options.get(format)
    # Guard clause: fail loudly for formats that have no default entry.
    if options is None:
        raise RuntimeError(f"No default options configured for {format}")
    return options


class DocumentConverter:
Expand All @@ -121,36 +133,26 @@ def __init__(
allowed_formats: Optional[List[InputFormat]] = None,
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
):
self.allowed_formats = allowed_formats
self.format_to_options = format_options

if self.allowed_formats is None:
# if self.format_to_options is not None:
# self.allowed_formats = self.format_to_options.keys()
# else:
self.allowed_formats = [e for e in InputFormat] # all formats

if self.format_to_options is None:
self.format_to_options = _format_to_default_options
else:
for f in self.allowed_formats:
if f not in self.format_to_options.keys():
_log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f]

remove_keys = []
for f in self.format_to_options.keys():
if f not in self.allowed_formats:
remove_keys.append(f)

for f in remove_keys:
self.format_to_options.pop(f)

self.allowed_formats = (
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
)
self.format_to_options = {
format: (
_get_default_option(format=format)
if (custom_option := (format_options or {}).get(format)) is None
else custom_option
)
for format in self.allowed_formats
}
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

def initialize_pipeline(self, format: InputFormat):
"""Initialize the conversion pipeline for the selected format."""
self._get_pipeline(doc_format=format)
pipeline = self._get_pipeline(doc_format=format)
if pipeline is None:
raise ConversionError(
f"No pipeline could be initialized for format {format}"
)

@validate_call(config=ConfigDict(strict=True))
def convert(
Expand Down Expand Up @@ -186,22 +188,28 @@ def convert_all(
limits=limits,
)
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

had_result = False
for conv_res in conv_res_iter:
had_result = True
if raises_on_error and conv_res.status not in {
ConversionStatus.SUCCESS,
ConversionStatus.PARTIAL_SUCCESS,
}:
raise RuntimeError(
raise ConversionError(
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
)
else:
yield conv_res

if not had_result and raises_on_error:
raise ConversionError(
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
)

def _convert(
self, conv_input: _DocumentConversionInput, raises_on_error: bool
) -> Iterator[ConversionResult]:
assert self.format_to_options is not None

start_time = time.monotonic()

for input_batch in chunkify(
Expand All @@ -223,27 +231,22 @@ def _convert(
):
elapsed = time.monotonic() - start_time
start_time = time.monotonic()

if item is not None:
_log.info(
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
)
yield item
else:
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
_log.info(
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
)
yield item

def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
assert self.format_to_options is not None

fopt = self.format_to_options.get(doc_format)

if fopt is None:
raise RuntimeError(f"Could not get pipeline for {doc_format}")
return None
else:
pipeline_class = fopt.pipeline_cls
pipeline_options = fopt.pipeline_options

assert pipeline_options is not None
if pipeline_options is None:
return None
# TODO this will ignore if different options have been defined for the same pipeline class.
if (
pipeline_class not in self.initialized_pipelines
Expand All @@ -257,11 +260,26 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:

def _process_document(
self, in_doc: InputDocument, raises_on_error: bool
) -> Optional[ConversionResult]:
assert self.allowed_formats is not None
assert in_doc.format in self.allowed_formats
) -> ConversionResult:

conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
valid = (
self.allowed_formats is not None and in_doc.format in self.allowed_formats
)
if valid:
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
else:
error_message = f"File format not allowed: {in_doc.file}"
if raises_on_error:
raise ConversionError(error_message)
else:
error_item = ErrorItem(
component_type=DoclingComponentType.USER_INPUT,
module_name="",
error_message=error_message,
)
conv_res = ConversionResult(
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
)

return conv_res

Expand All @@ -270,26 +288,28 @@ def _execute_pipeline(
) -> ConversionResult:
if in_doc.valid:
pipeline = self._get_pipeline(in_doc.format)
if pipeline is None: # Can't find a default pipeline. Should this raise?
if pipeline is not None:
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
else:
if raises_on_error:
raise RuntimeError(
raise ConversionError(
f"No pipeline could be initialized for {in_doc.file}."
)
else:
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
return conv_res

conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)

conv_res = ConversionResult(
input=in_doc,
status=ConversionStatus.FAILURE,
)
else:
if raises_on_error:
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
raise ConversionError(f"Input document {in_doc.file} is not valid.")

else:
# invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
conv_res = ConversionResult(
input=in_doc,
status=ConversionStatus.FAILURE,
)
# TODO add error log why it failed.

return conv_res
6 changes: 6 additions & 0 deletions docling/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
class BaseError(RuntimeError):
    """Root of this module's exception hierarchy.

    Derives from ``RuntimeError``, so handlers written as
    ``except RuntimeError`` also catch it and its subclasses.
    """


class ConversionError(BaseError):
    """Error type for failures while converting a document."""
2 changes: 1 addition & 1 deletion tests/test_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

GENERATE = True
GENERATE = False


def get_pdf_path():
Expand Down
Loading

0 comments on commit 34c7c79

Please sign in to comment.