From 34c7c798580476a86ce8abec30b1115fbb36fdd8 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:45:32 +0100 Subject: [PATCH] fix: improve handling of disallowed formats (#429) * fix: Fixes and tests for StopIteration on .convert() Signed-off-by: Christoph Auer * fix: Remove unnecessary case handling Signed-off-by: Christoph Auer * fix: Other test fixes Signed-off-by: Christoph Auer * improve handling of unsupported types - Introduced new explicit exception types instead of `RuntimeError` - Introduced new `ConversionStatus` value for unsupported formats - Tidied up converter member typing & removed asserts Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * robustify & simplify format option resolution Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> * rename new status, populate ConversionResult errors Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --------- Signed-off-by: Christoph Auer Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/datamodel/base_models.py | 2 + docling/datamodel/document.py | 34 ++++-- docling/document_converter.py | 186 +++++++++++++++++-------------- docling/exceptions.py | 6 + tests/test_interfaces.py | 2 +- tests/test_invalid_input.py | 45 ++++++++ 6 files changed, 181 insertions(+), 94 deletions(-) create mode 100644 docling/exceptions.py create mode 100644 tests/test_invalid_input.py diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 8e584f4b..55a19ac3 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -24,6 +24,7 @@ class ConversionStatus(str, Enum): FAILURE = auto() SUCCESS = auto() PARTIAL_SUCCESS = auto() + SKIPPED = auto() class InputFormat(str, Enum): @@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum): DOCUMENT_BACKEND = auto() MODEL = auto() DOC_ASSEMBLER = auto() + USER_INPUT = auto() class ErrorItem(BaseModel): diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 2fadb7f9..e5b49343 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -3,7 +3,7 @@ from enum import Enum from io import BytesIO from pathlib import Path, PurePath -from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union +from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union import filetype from docling_core.types.doc import ( @@ -164,12 +164,6 @@ def _init_doc( backend: Type[AbstractDocumentBackend], path_or_stream: Union[BytesIO, Path], ) -> None: - if backend is None: - raise RuntimeError( - f"No backend configuration provided for file {self.file.name} with format {self.format}. " - f"Please check your format configuration on DocumentConverter." - ) - self._backend = backend(self, path_or_stream=path_or_stream) if not self._backend.is_valid(): self.valid = False @@ -450,6 +444,25 @@ def make_spans(cell): return ds_doc +class _DummyBackend(AbstractDocumentBackend): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def is_valid(self) -> bool: + return False + + @classmethod + def supported_formats(cls) -> Set[InputFormat]: + return set() + + @classmethod + def supports_pagination(cls) -> bool: + return False + + def unload(self): + return super().unload() + + class _DocumentConversionInput(BaseModel): path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] @@ -461,11 +474,12 @@ def docs( for item in self.path_or_stream_iterator: obj = resolve_source_to_stream(item) if isinstance(item, str) else item format = self._guess_format(obj) + backend: Type[AbstractDocumentBackend] if format not in format_options.keys(): - _log.info( - f"Skipping input document {obj.name} because it isn't matching any of the allowed formats." + _log.error( + f"Input document {obj.name} does not match any allowed format." ) - continue + backend = _DummyBackend else: backend = format_options[format].backend diff --git a/docling/document_converter.py b/docling/document_converter.py index 74e6f84a..503a4c5b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -15,7 +15,13 @@ from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend -from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat +from docling.datamodel.base_models import ( + ConversionStatus, + DoclingComponentType, + DocumentStream, + ErrorItem, + InputFormat, +) from docling.datamodel.document import ( ConversionResult, InputDocument, @@ -23,6 +29,7 @@ ) from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.settings import DocumentLimits, settings +from docling.exceptions import ConversionError from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.simple_pipeline import SimplePipeline from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline @@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption): backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend -_format_to_default_options = { - InputFormat.XLSX: FormatOption( - pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend - ), - InputFormat.DOCX: FormatOption( - pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend - ), - InputFormat.PPTX: FormatOption( - pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend - ), - InputFormat.MD: FormatOption( - pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend - ), - InputFormat.ASCIIDOC: FormatOption( - pipeline_cls=SimplePipeline, backend=AsciiDocBackend - ), - InputFormat.HTML: FormatOption( - pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend - ), - InputFormat.IMAGE: FormatOption( - pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend - ), - InputFormat.PDF: FormatOption( - pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend - ), -} +def _get_default_option(format: InputFormat) -> FormatOption: + format_to_default_options = { + InputFormat.XLSX: FormatOption( + pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend + ), + InputFormat.DOCX: FormatOption( + pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend + ), + InputFormat.PPTX: FormatOption( + pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend + ), + InputFormat.MD: FormatOption( + pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend + ), + InputFormat.ASCIIDOC: FormatOption( + pipeline_cls=SimplePipeline, backend=AsciiDocBackend + ), + InputFormat.HTML: FormatOption( + pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend + ), + InputFormat.IMAGE: FormatOption( + pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend + ), + InputFormat.PDF: FormatOption( + pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend + ), + } + if (options := format_to_default_options.get(format)) is not None: + return options + else: + raise RuntimeError(f"No default options configured for {format}") class DocumentConverter: @@ -121,36 +133,26 @@ def __init__( allowed_formats: Optional[List[InputFormat]] = None, format_options: Optional[Dict[InputFormat, FormatOption]] = None, ): - self.allowed_formats = allowed_formats - self.format_to_options = format_options - - if self.allowed_formats is None: - # if self.format_to_options is not None: - # self.allowed_formats = self.format_to_options.keys() - # else: - self.allowed_formats = [e for e in InputFormat] # all formats - - if self.format_to_options is None: - self.format_to_options = _format_to_default_options - else: - for f in self.allowed_formats: - if f not in self.format_to_options.keys(): - _log.debug(f"Requested format {f} will use default options.") - self.format_to_options[f] = _format_to_default_options[f] - - remove_keys = [] - for f in self.format_to_options.keys(): - if f not in self.allowed_formats: - remove_keys.append(f) - - for f in remove_keys: - self.format_to_options.pop(f) - + self.allowed_formats = ( + allowed_formats if allowed_formats is not None else [e for e in InputFormat] + ) + self.format_to_options = { + format: ( + _get_default_option(format=format) + if (custom_option := (format_options or {}).get(format)) is None + else custom_option + ) + for format in self.allowed_formats + } self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} def initialize_pipeline(self, format: InputFormat): """Initialize the conversion pipeline for the selected format.""" - self._get_pipeline(doc_format=format) + pipeline = self._get_pipeline(doc_format=format) + if pipeline is None: + raise ConversionError( + f"No pipeline could be initialized for format {format}" + ) @validate_call(config=ConfigDict(strict=True)) def convert( @@ -186,22 +188,28 @@ def convert_all( limits=limits, ) conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error) + + had_result = False for conv_res in conv_res_iter: + had_result = True if raises_on_error and conv_res.status not in { ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS, }: - raise RuntimeError( + raise ConversionError( f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" ) else: yield conv_res + if not had_result and raises_on_error: + raise ConversionError( + f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." + ) + def _convert( self, conv_input: _DocumentConversionInput, raises_on_error: bool ) -> Iterator[ConversionResult]: - assert self.format_to_options is not None - start_time = time.monotonic() for input_batch in chunkify( @@ -223,27 +231,22 @@ def _convert( ): elapsed = time.monotonic() - start_time start_time = time.monotonic() - - if item is not None: - _log.info( - f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." - ) - yield item - else: - _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.") + _log.info( + f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec." + ) + yield item def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: - assert self.format_to_options is not None - fopt = self.format_to_options.get(doc_format) if fopt is None: - raise RuntimeError(f"Could not get pipeline for {doc_format}") + return None else: pipeline_class = fopt.pipeline_cls pipeline_options = fopt.pipeline_options - assert pipeline_options is not None + if pipeline_options is None: + return None # TODO this will ignore if different options have been defined for the same pipeline class. if ( pipeline_class not in self.initialized_pipelines @@ -257,11 +260,26 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]: def _process_document( self, in_doc: InputDocument, raises_on_error: bool - ) -> Optional[ConversionResult]: - assert self.allowed_formats is not None - assert in_doc.format in self.allowed_formats + ) -> ConversionResult: - conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) + valid = ( + self.allowed_formats is not None and in_doc.format in self.allowed_formats + ) + if valid: + conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error) + else: + error_message = f"File format not allowed: {in_doc.file}" + if raises_on_error: + raise ConversionError(error_message) + else: + error_item = ErrorItem( + component_type=DoclingComponentType.USER_INPUT, + module_name="", + error_message=error_message, + ) + conv_res = ConversionResult( + input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item] + ) return conv_res @@ -270,26 +288,28 @@ def _execute_pipeline( ) -> ConversionResult: if in_doc.valid: pipeline = self._get_pipeline(in_doc.format) - if pipeline is None: # Can't find a default pipeline. Should this raise? + if pipeline is not None: + conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error) + else: if raises_on_error: - raise RuntimeError( + raise ConversionError( f"No pipeline could be initialized for {in_doc.file}." ) else: - conv_res = ConversionResult(input=in_doc) - conv_res.status = ConversionStatus.FAILURE - return conv_res - - conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error) - + conv_res = ConversionResult( + input=in_doc, + status=ConversionStatus.FAILURE, + ) else: if raises_on_error: - raise RuntimeError(f"Input document {in_doc.file} is not valid.") + raise ConversionError(f"Input document {in_doc.file} is not valid.") else: # invalid doc or not of desired format - conv_res = ConversionResult(input=in_doc) - conv_res.status = ConversionStatus.FAILURE + conv_res = ConversionResult( + input=in_doc, + status=ConversionStatus.FAILURE, + ) # TODO add error log why it failed. return conv_res diff --git a/docling/exceptions.py b/docling/exceptions.py new file mode 100644 index 00000000..13145b9c --- /dev/null +++ b/docling/exceptions.py @@ -0,0 +1,6 @@ +class BaseError(RuntimeError): + pass + + +class ConversionError(BaseError): + pass diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py index a6675846..23bc3345 100644 --- a/tests/test_interfaces.py +++ b/tests/test_interfaces.py @@ -10,7 +10,7 @@ from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE = True +GENERATE = False def get_pdf_path(): diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py new file mode 100644 index 00000000..f40d79e4 --- /dev/null +++ b/tests/test_invalid_input.py @@ -0,0 +1,45 @@ +from io import BytesIO +from pathlib import Path + +import pytest + +from docling.datamodel.base_models import ConversionStatus, DocumentStream +from docling.document_converter import ConversionError, DocumentConverter + + +def get_pdf_path(): + + pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf") + return pdf_path + + +@pytest.fixture +def converter(): + converter = DocumentConverter() + + return converter + + +def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter): + result = converter.convert( + DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False + ) + assert result.status == ConversionStatus.SKIPPED + + +def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter): + with pytest.raises(ConversionError): + converter.convert( + DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), + raises_on_error=True, + ) + + +def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter): + result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False) + assert result.status == ConversionStatus.FAILURE + + +def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter): + with pytest.raises(ConversionError): + converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)