From 24560c53cd3d65a6151afc9defd8560d09678c2b Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Mon, 10 Mar 2025 09:50:00 +0100 Subject: [PATCH 1/2] Fix issue 30098 https://github.com/langchain-ai/langchain/issues/30098 --- .../langchain_community/document_loaders/parsers/pdf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 782edddad44af..904d74756e316 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -429,6 +429,7 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: if not self.images_parser: return "" from PIL import Image + import pypdf if "/XObject" not in cast(dict, page["/Resources"]).keys(): return "" @@ -438,13 +439,16 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: for obj in xObject: np_image: Any = None if xObject[obj]["/Subtype"] == "/Image": - if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS: + img_filter = xObject[obj]["/Filter"][1:] if type( + xObject[obj]["/Filter"]) == pypdf.generic._base.NameObject else \ + xObject[obj]["/Filter"][0][1:] + if img_filter in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] np_image = np.frombuffer( xObject[obj].get_data(), dtype=np.uint8 ).reshape(height, width, -1) - elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS: + elif img_filter in _PDF_FILTER_WITH_LOSS: np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) else: From 5f88c28f95926e10428e2fb318235c4f3b0c8777 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Wed, 26 Mar 2025 08:38:06 +0100 Subject: [PATCH 2/2] Fix bug 30098 --- .../document_loaders/parsers/pdf.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git 
a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 904d74756e316..7824181b9e7f8 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -428,8 +428,8 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: """ if not self.images_parser: return "" - from PIL import Image import pypdf + from PIL import Image if "/XObject" not in cast(dict, page["/Resources"]).keys(): return "" @@ -439,9 +439,11 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: for obj in xObject: np_image: Any = None if xObject[obj]["/Subtype"] == "/Image": - img_filter = xObject[obj]["/Filter"][1:] if type( - xObject[obj]["/Filter"]) == pypdf.generic._base.NameObject else \ - xObject[obj]["/Filter"][0][1:] + img_filter = ( + xObject[obj]["/Filter"][1:] + if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject + else xObject[obj]["/Filter"][0][1:] + ) if img_filter in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]