From 34c7c798580476a86ce8abec30b1115fbb36fdd8 Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Tue, 3 Dec 2024 12:45:32 +0100
Subject: [PATCH] fix: improve handling of disallowed formats (#429)

* fix: Fixes and tests for StopIteration on .convert()

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Remove unnecessary case handling

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: Other test fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* improve handling of unsupported types

- Introduced new explicit exception types instead of `RuntimeError`
- Introduced new `ConversionStatus` value for unsupported formats
- Tidied up converter member typing & removed asserts

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* robustify & simplify format option resolution

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* rename new status, populate ConversionResult errors

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling/datamodel/base_models.py |   2 +
 docling/datamodel/document.py    |  34 ++++--
 docling/document_converter.py    | 186 +++++++++++++++++--------------
 docling/exceptions.py            |   6 +
 tests/test_interfaces.py         |   2 +-
 tests/test_invalid_input.py      |  45 ++++++++
 6 files changed, 181 insertions(+), 94 deletions(-)
 create mode 100644 docling/exceptions.py
 create mode 100644 tests/test_invalid_input.py

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index 8e584f4b..55a19ac3 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -24,6 +24,7 @@ class ConversionStatus(str, Enum):
     FAILURE = auto()
     SUCCESS = auto()
     PARTIAL_SUCCESS = auto()
+    SKIPPED = auto()
 
 
 class InputFormat(str, Enum):
@@ -95,6 +96,7 @@ class DoclingComponentType(str, Enum):
     DOCUMENT_BACKEND = auto()
     MODEL = auto()
     DOC_ASSEMBLER = auto()
+    USER_INPUT = auto()
 
 
 class ErrorItem(BaseModel):
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 2fadb7f9..e5b49343 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -3,7 +3,7 @@
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
+from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
 
 import filetype
 from docling_core.types.doc import (
@@ -164,12 +164,6 @@ def _init_doc(
         backend: Type[AbstractDocumentBackend],
         path_or_stream: Union[BytesIO, Path],
     ) -> None:
-        if backend is None:
-            raise RuntimeError(
-                f"No backend configuration provided for file {self.file.name} with format {self.format}. "
-                f"Please check your format configuration on DocumentConverter."
-            )
-
         self._backend = backend(self, path_or_stream=path_or_stream)
         if not self._backend.is_valid():
             self.valid = False
@@ -450,6 +444,25 @@ def make_spans(cell):
         return ds_doc
 
 
+class _DummyBackend(AbstractDocumentBackend):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def is_valid(self) -> bool:
+        return False
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return set()
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        return False
+
+    def unload(self):
+        return super().unload()
+
+
 class _DocumentConversionInput(BaseModel):
 
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,11 +474,12 @@ def docs(
         for item in self.path_or_stream_iterator:
             obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
+            backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
-                _log.info(
-                    f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
+                _log.error(
+                    f"Input document {obj.name} does not match any allowed format."
                 )
-                continue
+                backend = _DummyBackend
             else:
                 backend = format_options[format].backend
 
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 74e6f84a..503a4c5b 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -15,7 +15,13 @@
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    DocumentStream,
+    ErrorItem,
+    InputFormat,
+)
 from docling.datamodel.document import (
     ConversionResult,
     InputDocument,
@@ -23,6 +29,7 @@
 )
 from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import DocumentLimits, settings
+from docling.exceptions import ConversionError
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.pipeline.simple_pipeline import SimplePipeline
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
     backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
 
 
-_format_to_default_options = {
-    InputFormat.XLSX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
-    ),
-    InputFormat.DOCX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
-    ),
-    InputFormat.PPTX: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-    ),
-    InputFormat.MD: FormatOption(
-        pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-    ),
-    InputFormat.ASCIIDOC: FormatOption(
-        pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-    ),
-    InputFormat.HTML: FormatOption(
-        pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
-    ),
-    InputFormat.IMAGE: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-    InputFormat.PDF: FormatOption(
-        pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
-    ),
-}
+def _get_default_option(format: InputFormat) -> FormatOption:
+    format_to_default_options = {
+        InputFormat.XLSX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
+        ),
+        InputFormat.DOCX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
+        ),
+        InputFormat.PPTX: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
+        ),
+        InputFormat.MD: FormatOption(
+            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
+        ),
+        InputFormat.ASCIIDOC: FormatOption(
+            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
+        ),
+        InputFormat.HTML: FormatOption(
+            pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
+        ),
+        InputFormat.IMAGE: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+        InputFormat.PDF: FormatOption(
+            pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
+        ),
+    }
+    if (options := format_to_default_options.get(format)) is not None:
+        return options
+    else:
+        raise RuntimeError(f"No default options configured for {format}")
 
 
 class DocumentConverter:
@@ -121,36 +133,26 @@ def __init__(
         allowed_formats: Optional[List[InputFormat]] = None,
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
-        self.allowed_formats = allowed_formats
-        self.format_to_options = format_options
-
-        if self.allowed_formats is None:
-            # if self.format_to_options is not None:
-            #    self.allowed_formats = self.format_to_options.keys()
-            # else:
-            self.allowed_formats = [e for e in InputFormat]  # all formats
-
-        if self.format_to_options is None:
-            self.format_to_options = _format_to_default_options
-        else:
-            for f in self.allowed_formats:
-                if f not in self.format_to_options.keys():
-                    _log.debug(f"Requested format {f} will use default options.")
-                    self.format_to_options[f] = _format_to_default_options[f]
-
-            remove_keys = []
-            for f in self.format_to_options.keys():
-                if f not in self.allowed_formats:
-                    remove_keys.append(f)
-
-            for f in remove_keys:
-                self.format_to_options.pop(f)
-
+        self.allowed_formats = (
+            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+        )
+        self.format_to_options = {
+            format: (
+                _get_default_option(format=format)
+                if (custom_option := (format_options or {}).get(format)) is None
+                else custom_option
+            )
+            for format in self.allowed_formats
+        }
         self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
 
     def initialize_pipeline(self, format: InputFormat):
         """Initialize the conversion pipeline for the selected format."""
-        self._get_pipeline(doc_format=format)
+        pipeline = self._get_pipeline(doc_format=format)
+        if pipeline is None:
+            raise ConversionError(
+                f"No pipeline could be initialized for format {format}"
+            )
 
     @validate_call(config=ConfigDict(strict=True))
     def convert(
@@ -186,22 +188,28 @@ def convert_all(
             limits=limits,
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
+
+        had_result = False
         for conv_res in conv_res_iter:
+            had_result = True
             if raises_on_error and conv_res.status not in {
                 ConversionStatus.SUCCESS,
                 ConversionStatus.PARTIAL_SUCCESS,
             }:
-                raise RuntimeError(
+                raise ConversionError(
                     f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                 )
             else:
                 yield conv_res
 
+        if not had_result and raises_on_error:
+            raise ConversionError(
+                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+            )
+
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
     ) -> Iterator[ConversionResult]:
-        assert self.format_to_options is not None
-
         start_time = time.monotonic()
 
         for input_batch in chunkify(
@@ -223,27 +231,22 @@ def _convert(
             ):
                 elapsed = time.monotonic() - start_time
                 start_time = time.monotonic()
-
-                if item is not None:
-                    _log.info(
-                        f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
-                    )
-                    yield item
-                else:
-                    _log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
+                _log.info(
+                    f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
+                )
+                yield item
 
     def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
-        assert self.format_to_options is not None
-
         fopt = self.format_to_options.get(doc_format)
 
         if fopt is None:
-            raise RuntimeError(f"Could not get pipeline for {doc_format}")
+            return None
         else:
             pipeline_class = fopt.pipeline_cls
             pipeline_options = fopt.pipeline_options
 
-        assert pipeline_options is not None
+        if pipeline_options is None:
+            return None
         # TODO this will ignore if different options have been defined for the same pipeline class.
         if (
             pipeline_class not in self.initialized_pipelines
@@ -257,11 +260,26 @@ def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
 
     def _process_document(
         self, in_doc: InputDocument, raises_on_error: bool
-    ) -> Optional[ConversionResult]:
-        assert self.allowed_formats is not None
-        assert in_doc.format in self.allowed_formats
+    ) -> ConversionResult:
 
-        conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        valid = (
+            self.allowed_formats is not None and in_doc.format in self.allowed_formats
+        )
+        if valid:
+            conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
+        else:
+            error_message = f"File format not allowed: {in_doc.file}"
+            if raises_on_error:
+                raise ConversionError(error_message)
+            else:
+                error_item = ErrorItem(
+                    component_type=DoclingComponentType.USER_INPUT,
+                    module_name="",
+                    error_message=error_message,
+                )
+                conv_res = ConversionResult(
+                    input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
+                )
 
         return conv_res
 
@@ -270,26 +288,28 @@ def _execute_pipeline(
     ) -> ConversionResult:
         if in_doc.valid:
             pipeline = self._get_pipeline(in_doc.format)
-            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+            if pipeline is not None:
+                conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
+            else:
                 if raises_on_error:
-                    raise RuntimeError(
+                    raise ConversionError(
                         f"No pipeline could be initialized for {in_doc.file}."
                     )
                 else:
-                    conv_res = ConversionResult(input=in_doc)
-                    conv_res.status = ConversionStatus.FAILURE
-                    return conv_res
-
-            conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
-
+                    conv_res = ConversionResult(
+                        input=in_doc,
+                        status=ConversionStatus.FAILURE,
+                    )
         else:
             if raises_on_error:
-                raise RuntimeError(f"Input document {in_doc.file} is not valid.")
+                raise ConversionError(f"Input document {in_doc.file} is not valid.")
 
             else:
                 # invalid doc or not of desired format
-                conv_res = ConversionResult(input=in_doc)
-                conv_res.status = ConversionStatus.FAILURE
+                conv_res = ConversionResult(
+                    input=in_doc,
+                    status=ConversionStatus.FAILURE,
+                )
                 # TODO add error log why it failed.
 
         return conv_res
diff --git a/docling/exceptions.py b/docling/exceptions.py
new file mode 100644
index 00000000..13145b9c
--- /dev/null
+++ b/docling/exceptions.py
@@ -0,0 +1,6 @@
+class BaseError(RuntimeError):
+    pass
+
+
+class ConversionError(BaseError):
+    pass
diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py
index a6675846..23bc3345 100644
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -10,7 +10,7 @@
 
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
 
-GENERATE = True
+GENERATE = False
 
 
 def get_pdf_path():
diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py
new file mode 100644
index 00000000..f40d79e4
--- /dev/null
+++ b/tests/test_invalid_input.py
@@ -0,0 +1,45 @@
+from io import BytesIO
+from pathlib import Path
+
+import pytest
+
+from docling.datamodel.base_models import ConversionStatus, DocumentStream
+from docling.document_converter import ConversionError, DocumentConverter
+
+
+def get_pdf_path():
+
+    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
+    return pdf_path
+
+
+@pytest.fixture
+def converter():
+    converter = DocumentConverter()
+
+    return converter
+
+
+def test_convert_unsupported_doc_format_wout_exception(converter: DocumentConverter):
+    result = converter.convert(
+        DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")), raises_on_error=False
+    )
+    assert result.status == ConversionStatus.SKIPPED
+
+
+def test_convert_unsupported_doc_format_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(
+            DocumentStream(name="input.xyz", stream=BytesIO(b"xyz")),
+            raises_on_error=True,
+        )
+
+
+def test_convert_too_small_filesize_limit_wout_exception(converter: DocumentConverter):
+    result = converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=False)
+    assert result.status == ConversionStatus.FAILURE
+
+
+def test_convert_too_small_filesize_limit_with_exception(converter: DocumentConverter):
+    with pytest.raises(ConversionError):
+        converter.convert(get_pdf_path(), max_file_size=1, raises_on_error=True)