Skip to content

Commit

Permalink
fix: set valid=false for invalid backends (#171)
Browse files Browse the repository at this point in the history
* fix: set valid=false for invalid backends

Signed-off-by: Michele Dolfi <[email protected]>

* Add test case for InputDocument

Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
dolfim-ibm and cau-git authored Oct 23, 2024
1 parent b8d2286 commit 3496b48
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
4 changes: 4 additions & 0 deletions docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,13 @@ def __init__(
self.valid = False

except (FileNotFoundError, OSError) as e:
self.valid = False
_log.exception(
f"File {self.file.name} not found or cannot be opened.", exc_info=e
)
# raise
except RuntimeError as e:
self.valid = False
_log.exception(
f"An unexpected error occurred while opening the document {self.file.name}",
exc_info=e,
Expand All @@ -166,6 +168,8 @@ def _init_doc(
)

self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid():
self.valid = False


class DocumentFormat(str, Enum):
Expand Down
58 changes: 58 additions & 0 deletions tests/test_input_doc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from io import BytesIO
from pathlib import Path

from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument


def test_in_doc_from_valid_path():
    """An InputDocument created from the sample PDF path must report valid."""
    test_doc_path = Path("./tests/data/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    # Identity check instead of `== True`: asserts the flag is exactly the
    # boolean True, and avoids the E712 comparison-to-True lint.
    assert doc.valid is True


def test_in_doc_from_invalid_path():
    """An InputDocument created from a non-existent path must report invalid."""
    test_doc_path = Path("./tests/does/not/exist.pdf")

    doc = _make_input_doc(test_doc_path)

    # Identity check instead of `== False`: asserts the flag is exactly the
    # boolean False rather than any other falsy-equal value.
    assert doc.valid is False


def test_in_doc_from_valid_buf():
    """An InputDocument created from an in-memory copy of the sample PDF must report valid."""
    # read_bytes() opens, reads and closes the file in one call, so no file
    # handle is left dangling (the previous open("rb").read() never closed it).
    buf = BytesIO(Path("./tests/data/2206.01062.pdf").read_bytes())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid is True


def test_in_doc_from_invalid_buf():
    """An InputDocument created from an empty byte buffer must report invalid."""
    # Zero bytes cannot be a PDF; the document is expected to be flagged invalid.
    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid is False


def _make_input_doc(path):
    """Build an InputDocument for *path*, declared as PDF and using PyPdfiumDocumentBackend."""
    return InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
        backend=PyPdfiumDocumentBackend,
    )


def _make_input_doc_from_stream(doc_stream):
    """Build an InputDocument from a DocumentStream's buffer and name.

    The stream's ``stream`` attribute is passed as the document source and its
    ``name`` attribute as the filename; the format is PDF and the backend is
    PyPdfiumDocumentBackend.
    """
    init_kwargs = {
        "path_or_stream": doc_stream.stream,
        "format": InputFormat.PDF,
        "filename": doc_stream.name,
        "backend": PyPdfiumDocumentBackend,
    }
    return InputDocument(**init_kwargs)

0 comments on commit 3496b48

Please sign in to comment.