Skip to content
15 changes: 14 additions & 1 deletion _test_unstructured_client/unit/test_pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
from pypdf import PdfReader

from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
from unstructured_client._hooks.custom.pdf_utils import check_pdf, read_pdf, PDFValidationError
from _test_unstructured_client.unit_utils import sample_docs_path


Expand All @@ -23,6 +23,7 @@ def test_check_pdf_with_valid_pdf():
assert isinstance(result, PdfReader)


# TODO(klaijan) - add pdf file when file is ready
@pytest.mark.parametrize(
("pdf_name", "expected_error_message"),
[
Expand Down Expand Up @@ -51,3 +52,15 @@ def test_check_pdf_raises_pdf_validation_error(
check_pdf(pdf)

assert exc_info.value.message == expected_error_message


# TODO(klaijan) - uncomment when file is ready
"""
def test_check_read_pdf():
pdf_path = sample_docs_path(".pdf")
with open(pdf_path, "rb") as f:
pdf_content = f.read()
pdf = read_pdf(pdf_content)
result = check_pdf(pdf)
assert isinstance(result, PdfReader)
"""
50 changes: 47 additions & 3 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,43 @@ def __init__(self, message: str):


def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
    """Read a PDF, falling back to extracting it from surrounding bytes.

    The input is first handed to ``read_pdf_raw``. If that yields no reader,
    the raw bytes are scanned for the ``%PDF-`` header and the last ``%%EOF``
    trailer marker, and only that span is parsed and validated with
    ``check_pdf``.

    Args:
        pdf_file: The PDF content, either as ``bytes`` or as a file-like
            object exposing ``seek()`` and ``read()``.

    Returns:
        A ``PdfReader`` when either attempt succeeds, ``None`` otherwise.

    Raises:
        IOError: If ``pdf_file`` is neither ``bytes`` nor an object with a
            ``read()`` method, or if rewinding/reading the stream fails.
    """
    reader = read_pdf_raw(pdf_file=pdf_file)
    # Identity check: the helper signals failure with None, so do not rely on
    # the truthiness of the reader object itself.
    if reader is not None:
        return reader

    # TODO(klaijan) - remove once debugged
    pdf_logger.debug(
        "Primary PdfReader parse failed, "
        "attempting multipart and raw extraction fallbacks."
    )

    # Load the raw bytes, whichever form the input took.
    if isinstance(pdf_file, bytes):
        raw = pdf_file
    elif hasattr(pdf_file, "read"):
        try:
            # The first parse attempt may have consumed the stream.
            pdf_file.seek(0)
            raw = pdf_file.read()
        except Exception as e:
            raise IOError(f"Failed to read file stream: {e}") from e
    else:
        raise IOError("Expected bytes or a file-like object with 'read()' method")

    header = b"%PDF-"
    trailer = b"%%EOF"

    # Best effort: any failure here (including a non-bytes payload returned by
    # read()) is logged and reported as "not a PDF" rather than raised.
    try:
        start = raw.find(header)
        if start == -1:
            return None

        # Use the LAST trailer marker: a PDF with incremental updates contains
        # several, and cutting at the first would drop the later revisions.
        trailer_pos = raw.rfind(trailer)
        if trailer_pos > start:
            end = trailer_pos + len(trailer)
        else:
            # No usable trailer marker: keep everything after the header and
            # let the non-strict parser try to recover, instead of slicing
            # with a bogus end offset derived from find() returning -1.
            end = len(raw)

        pdf = PdfReader(io.BytesIO(raw[start:end]), strict=False)
        return check_pdf(pdf)
    except Exception as e:
        pdf_logger.debug("%%PDF- slicing fallback failed: %s", e)

    return None


def read_pdf_raw(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
"""Reads the given PDF file.

Args:
Expand All @@ -34,13 +71,20 @@ def read_pdf(pdf_file: Union[BinaryIO, bytes]) -> Optional[PdfReader]:
Returns:
The PdfReader object if the file is a PDF, None otherwise.
"""

try:
if isinstance(pdf_file, bytes):
content = cast(bytes, pdf_file)
pdf_file = io.BytesIO(content)
return PdfReader(pdf_file, strict=False)
except (PdfReadError, UnicodeDecodeError):
reader = PdfReader(pdf_file, strict=False)
return check_pdf(reader)
except (PdfReadError, UnicodeDecodeError) as e:
pdf_logger.debug("Read pdf failed: %s", e)
return None
except PDFValidationError as e:
pdf_logger.debug("Check pdf failed: %s", e)
return None
except Exception as e:
pdf_logger.debug("An unexpected error occurred: %s", e)
return None


Expand Down