From 54741ef7b5c6ee95a175be87f569c210084d587b Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Tue, 29 Apr 2025 17:50:39 +0200 Subject: [PATCH 1/2] Propose @deprecated for pdfs --- .../document_loaders/parsers/pdf.py | 1162 +++++++++++++---- .../document_loaders/pdf.py | 471 +++++-- 2 files changed, 1256 insertions(+), 377 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 6b3a0a06..26daa1c3 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -2,6 +2,7 @@ from __future__ import annotations +import asyncio import html import io import logging @@ -9,7 +10,7 @@ import warnings from datetime import datetime from pathlib import Path -from tempfile import TemporaryDirectory +from tempfile import NamedTemporaryFile, TemporaryDirectory from typing import ( TYPE_CHECKING, Any, @@ -28,9 +29,14 @@ import numpy import numpy as np from langchain_core.documents import Document +from langchain_core.prompts import PromptTemplate from langchain_community.document_loaders.base import BaseBlobParser from langchain_community.document_loaders.blob_loaders import Blob +from langchain_community.document_loaders.parsers import ( + LLMImageBlobParser, + TesseractBlobParser, +) from langchain_community.document_loaders.parsers.images import ( BaseImageBlobParser, RapidOCRBlobParser, @@ -97,44 +103,19 @@ def extract_from_images_with_rapidocr( _FORMAT_IMAGE_STR = "\n\n{image_text}\n\n" _JOIN_IMAGES = "\n" _JOIN_TABLES = "\n" -_DEFAULT_PAGES_DELIMITER = "\n\f" +_DEFAULT_PAGE_DELIMITOR = "\n\f" _STD_METADATA_KEYS = {"source", "total_pages", "creationdate", "creator", "producer"} -def _format_inner_image(blob: Blob, content: str, format: str) -> str: - """Format the content of the image with the source of the blob. - - blob: The blob containing the image. 
- format:: - The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) - """ - if content: - source = blob.source or "#" - if format == "markdown-img": - content = content.replace("]", r"\\]") - content = f"![{content}]({source})" - elif format == "html-img": - content = f'{html.escape(content, quote=True)} src=' - return content - - def _validate_metadata(metadata: dict[str, Any]) -> dict[str, Any]: - """Validate that the metadata has all the standard keys and the page is an integer. - - The standard keys are: + """Validates the presence of at least the following keys: - source + - page (if mode='page') - total_page - creationdate - creator - producer - - Validate that page is an integer if it is present. """ if not _STD_METADATA_KEYS.issubset(metadata.keys()): raise ValueError("The PDF parser must valorize the standard metadata.") @@ -171,7 +152,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]: except ValueError: new_metadata[k] = v elif k in map_key: - # Normalize key with others PDF parser + # Normaliaze key with others PDF parser new_metadata[map_key[k]] = v new_metadata[k] = v elif isinstance(v, str): @@ -181,7 +162,7 @@ def _purge_metadata(metadata: dict[str, Any]) -> dict[str, Any]: return new_metadata -_PARAGRAPH_DELIMITER = [ +_PARAGRAPH_DELIMITOR = [ "\n\n\n", "\n\n", ] # To insert images or table in the middle of the page. 
@@ -203,7 +184,7 @@ def _recurs_merge_text_and_extras( extras: list[str], text_from_page: str, recurs: bool ) -> Optional[str]: if extras: - for delim in _PARAGRAPH_DELIMITER: + for delim in _PARAGRAPH_DELIMITOR: pos = text_from_page.rfind(delim) if pos != -1: # search penultimate, to bypass an error in footer @@ -234,7 +215,7 @@ def _recurs_merge_text_and_extras( all_extras = "" str_extras = "\n\n".join(filter(lambda x: x, extras)) if str_extras: - all_extras = _PARAGRAPH_DELIMITER[-1] + str_extras + all_extras = _PARAGRAPH_DELIMITOR[-1] + str_extras all_text = text_from_page + all_extras return all_text @@ -272,7 +253,7 @@ class PyPDFParser(BaseBlobParser): parser = PyPDFParser( # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # images_parser = TesseractBlobParser(), ) @@ -293,11 +274,10 @@ def __init__( self, password: Optional[Union[str, bytes]] = None, extract_images: bool = False, - *, + *, # Move on top ? mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extraction_mode: Literal["plain", "layout"] = "plain", extraction_kwargs: Optional[dict[str, Any]] = None, ): @@ -305,24 +285,21 @@ def __init__( Args: password: Optional password for opening encrypted PDFs. - extract_images: Whether to extract images from the PDF. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. + extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. 
- - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) - extraction_mode: “plain” for legacy functionality, “layout” extract text - in a fixed width format that closely adheres to the rendered layout in - the source pdf. + extraction_mode: “plain” for legacy functionality, “layout” for experimental + layout mode functionality extraction_kwargs: Optional additional parameters for the extraction process. + Returns: + This method does not directly return data. Use the `parse` or `lazy_parse` + methods to retrieve parsed documents with content and metadata. + Raises: ValueError: If the `mode` is not "single" or "page". """ @@ -333,14 +310,13 @@ def __init__( if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.images_parser = images_parser - self.images_inner_format = images_inner_format self.password = password self.mode = mode - self.pages_delimiter = pages_delimiter + self.pages_delimitor = pages_delimitor self.extraction_mode = extraction_mode self.extraction_kwargs = extraction_kwargs or {} - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. 
@@ -359,7 +335,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: import pypdf except ImportError: raise ImportError( - "`pypdf` package not found, please install it with `pip install pypdf`" + "pypdf package not found, please install it with `pip install pypdf`" ) def _extract_text_from_page(page: pypdf.PageObject) -> str: @@ -380,7 +356,7 @@ def _extract_text_from_page(page: pypdf.PageObject) -> str: **self.extraction_kwargs, ) - with blob.as_bytes_io() as pdf_file_obj: + with blob.as_bytes_io() as pdf_file_obj: # type: ignore[attr-defined] pdf_reader = pypdf.PdfReader(pdf_file_obj, password=self.password) doc_metadata = _purge_metadata( @@ -413,7 +389,7 @@ def _extract_text_from_page(page: pypdf.PageObject) -> str: single_texts.append(all_text) if self.mode == "single": yield Document( - page_content=self.pages_delimiter.join(single_texts), + page_content=self.pages_delimitor.join(single_texts), metadata=_validate_metadata(doc_metadata), ) @@ -428,29 +404,23 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: """ if not self.images_parser: return "" - import pypdf from PIL import Image if "/XObject" not in cast(dict, page["/Resources"]).keys(): return "" - xObject = page["/Resources"]["/XObject"].get_object() + xObject = page["/Resources"]["/XObject"].get_object() # type: ignore[index] images = [] for obj in xObject: - np_image: Any = None + np_image = None if xObject[obj]["/Subtype"] == "/Image": - img_filter = ( - xObject[obj]["/Filter"][1:] - if type(xObject[obj]["/Filter"]) is pypdf.generic._base.NameObject - else xObject[obj]["/Filter"][0][1:] - ) - if img_filter in _PDF_FILTER_WITHOUT_LOSS: + if xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITHOUT_LOSS: height, width = xObject[obj]["/Height"], xObject[obj]["/Width"] np_image = np.frombuffer( xObject[obj].get_data(), dtype=np.uint8 ).reshape(height, width, -1) - elif img_filter in _PDF_FILTER_WITH_LOSS: + elif xObject[obj]["/Filter"][1:] in _PDF_FILTER_WITH_LOSS: np_image = 
np.array(Image.open(io.BytesIO(xObject[obj].get_data()))) else: @@ -459,9 +429,8 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str: image_bytes = io.BytesIO() Image.fromarray(np_image).save(image_bytes, format="PNG") blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png") - image_text = next(self.images_parser.lazy_parse(blob)).page_content images.append( - _format_inner_image(blob, image_text, self.images_inner_format) + next(self.images_parser.lazy_parse(blob)).page_content ) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) @@ -501,7 +470,7 @@ class PDFMinerParser(BaseBlobParser): parser = PDFMinerParser( # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # extract_images = True, # images_to_text = convert_images_to_text_with_tesseract(), ) @@ -527,9 +496,8 @@ def __init__( *, password: Optional[str] = None, mode: Literal["single", "page"] = "single", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", concatenate_pages: Optional[bool] = None, ): """Initialize a parser based on PDFMiner. @@ -538,15 +506,11 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: Extraction mode to use. Either "single" or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from PDF. - images_inner_format: The format for the parsed output. 
- - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) + images_to_text: Optional function or callable to convert images to text + during extraction. concatenate_pages: Deprecated. If True, concatenate all PDF pages into one a single document. Otherwise, return one document per page. @@ -568,10 +532,9 @@ def __init__( images_parser = RapidOCRBlobParser() self.extract_images = extract_images self.images_parser = images_parser - self.images_inner_format = images_inner_format - self.password = password self.mode = mode - self.pages_delimiter = pages_delimiter + self.pages_delimitor = pages_delimitor + self.password = password if concatenate_pages is not None: if not PDFMinerParser._warn_concatenate_pages: PDFMinerParser._warn_concatenate_pages = True @@ -677,7 +640,7 @@ def _get_metadata( return metadata - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. @@ -693,7 +656,6 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: An iterator over the parsed documents. """ try: - import pdfminer from pdfminer.converter import PDFLayoutAnalyzer from pdfminer.layout import ( LAParams, @@ -706,13 +668,6 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: ) from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager from pdfminer.pdfpage import PDFPage - - if int(pdfminer.__version__) < 20201018: - raise ImportError( - "This parser is tested with pdfminer.six version 20201018 or " - "later. Remove pdfminer, and install pdfminer.six with " - "`pip uninstall pdfminer && pip install pdfminer.six`." 
- ) except ImportError: raise ImportError( "pdfminer package not found, please install it " @@ -723,8 +678,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: pages = PDFPage.get_pages(pdf_file_obj, password=self.password or "") rsrcmgr = PDFResourceManager() doc_metadata = _purge_metadata( - {"producer": "PDFMiner", "creator": "PDFMiner", "creationdate": ""} - | self._get_metadata(pdf_file_obj, password=self.password or "") + self._get_metadata(pdf_file_obj, password=self.password or "") ) doc_metadata["source"] = blob.source @@ -757,12 +711,10 @@ def render(item: LTItem) -> None: image_text = next( self.images_parser.lazy_parse(blob) ).page_content - - text_io.write( - _format_inner_image( - blob, image_text, self.images_inner_format + if image_text: + text_io.write( + _FORMAT_IMAGE_STR.format(image_text=image_text) ) - ) else: pass @@ -793,8 +745,8 @@ def render(item: LTItem) -> None: all_text = all_text[:-1] all_content.append(all_text) if self.mode == "single": - # Add pages_delimiter between pages - document_content = self.pages_delimiter.join(all_content) + # Add page_delimitor between pages + document_content = self.pages_delimitor.join(all_content) yield Document( page_content=document_content, metadata=_validate_metadata(doc_metadata), @@ -834,7 +786,7 @@ class PyMuPDFParser(BaseBlobParser): parser = PyMuPDFParser( # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # images_parser = TesseractBlobParser(), # extract_tables="markdown", # extract_tables_settings=None, @@ -865,9 +817,8 @@ def __init__( *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, extract_tables_settings: 
Optional[dict[str, Any]] = None, ) -> None: @@ -877,20 +828,16 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) extract_tables: Whether to extract tables in a specific format, such as "csv", "markdown", or "html". extract_tables_settings: Optional dictionary of settings for customizing table extraction. + **kwargs: Additional keyword arguments for customizing text extraction + behavior. Returns: This method does not directly return data. 
Use the `parse` or `lazy_parse` @@ -908,18 +855,17 @@ def __init__( raise ValueError("mode must be markdown") self.mode = mode - self.pages_delimiter = pages_delimiter + self.pages_delimitor = pages_delimitor self.password = password self.text_kwargs = text_kwargs or {} if extract_images and not images_parser: images_parser = RapidOCRBlobParser() self.extract_images = extract_images - self.images_inner_format = images_inner_format self.images_parser = images_parser self.extract_tables = extract_tables self.extract_tables_settings = extract_tables_settings - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] return self._lazy_parse( blob, ) @@ -927,18 +873,14 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: def _lazy_parse( self, blob: Blob, - # text-kwargs is present for backwards compatibility. - # Users should not use it directly. - text_kwargs: Optional[dict[str, Any]] = None, - ) -> Iterator[Document]: + text_kwargs: Optional[dict[str, Any]] = None, # deprectaed + ) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob. Insert image, if possible, between two paragraphs. In this way, a paragraph can be continued on the next page. Args: blob: The blob to parse. - text_kwargs: Optional keyword arguments to pass to the `get_text` method. - If provided at run time, it will override the default text_kwargs. Raises: ImportError: If the `pypdf` package is not found. 
@@ -949,7 +891,8 @@ def _lazy_parse( try: import pymupdf - text_kwargs = text_kwargs or self.text_kwargs + if not text_kwargs: + text_kwargs = {} if not self.extract_tables_settings: from pymupdf.table import ( DEFAULT_JOIN_TOLERANCE, @@ -990,18 +933,14 @@ def _lazy_parse( ) with PyMuPDFParser._lock: - with blob.as_bytes_io() as file_path: - if blob.data is None: + with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] + if blob.data is None: # type: ignore[attr-defined] doc = pymupdf.open(file_path) else: doc = pymupdf.open(stream=file_path, filetype="pdf") if doc.is_encrypted: doc.authenticate(self.password) - doc_metadata = { - "producer": "PyMuPDF", - "creator": "PyMuPDF", - "creationdate": "", - } | self._extract_metadata(doc, blob) + doc_metadata = self._extract_metadata(doc, blob) full_content = [] for page in doc: all_text = self._get_page_content(doc, page, text_kwargs).strip() @@ -1017,7 +956,7 @@ def _lazy_parse( if self.mode == "single": yield Document( - page_content=self.pages_delimiter.join(full_content), + page_content=self.pages_delimitor.join(full_content), metadata=_validate_metadata(doc_metadata), ) @@ -1060,14 +999,14 @@ def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict: Returns: dict: The extracted metadata. 
""" - metadata = _purge_metadata( - { - **{ + return _purge_metadata( + dict( + { "producer": "PyMuPDF", "creator": "PyMuPDF", "creationdate": "", - "source": blob.source, - "file_path": blob.source, + "source": blob.source, # type: ignore[attr-defined] + "file_path": blob.source, # type: ignore[attr-defined] "total_pages": len(doc), }, **{ @@ -1075,12 +1014,8 @@ def _extract_metadata(self, doc: pymupdf.Document, blob: Blob) -> dict: for k in doc.metadata if isinstance(doc.metadata[k], (str, int)) }, - } + ) ) - for k in ("modDate", "creationDate"): - if k in doc.metadata: - metadata[k] = doc.metadata[k] - return metadata def _extract_images_from_page( self, doc: pymupdf.Document, page: pymupdf.Page @@ -1112,11 +1047,7 @@ def _extract_images_from_page( blob = Blob.from_data( image_bytes.getvalue(), mime_type="application/x-npy" ) - image_text = next(self.images_parser.lazy_parse(blob)).page_content - - images.append( - _format_inner_image(blob, image_text, self.images_inner_format) - ) + images.append(next(self.images_parser.lazy_parse(blob)).page_content) return _FORMAT_IMAGE_STR.format( image_text=_JOIN_IMAGES.join(filter(None, images)) ) @@ -1201,7 +1132,7 @@ class PyPDFium2Parser(BaseBlobParser): parser = PyPDFium2Parser( # password=None, mode="page", - pages_delimiter="\n\f", + pages_delimitor="\n\f", # extract_images = True, # images_to_text = convert_images_to_text_with_tesseract(), ) @@ -1222,6 +1153,11 @@ class PyPDFium2Parser(BaseBlobParser): # PyPDFium2 is not thread safe. 
# See https://pypdfium2.readthedocs.io/en/stable/python_api.html#thread-incompatibility _lock = threading.Lock() + warnings.filterwarnings( + "ignore", + module=r"^pypdfium2._helpers.textpage$", + message="get_text_range\\(\\) call with default params will be .*", + ) def __init__( self, @@ -1229,9 +1165,8 @@ def __init__( *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", ) -> None: """Initialize a parser based on PyPDFium2. @@ -1239,20 +1174,10 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) - extraction_mode: “plain” for legacy functionality, “layout” for experimental - layout mode functionality - extraction_kwargs: Optional additional parameters for the extraction - process. Returns: This method does not directly return data. 
Use the `parse` or `lazy_parse` @@ -1265,15 +1190,12 @@ def __init__( if mode not in ["single", "page"]: raise ValueError("mode must be single or page") self.extract_images = extract_images - if extract_images and not images_parser: - images_parser = RapidOCRBlobParser() self.images_parser = images_parser - self.images_inner_format = images_inner_format - self.password = password self.mode = mode - self.pages_delimiter = pages_delimiter + self.pages_delimitor = pages_delimitor + self.password = password - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """ Lazily parse the blob. Insert image, if possible, between two paragraphs. @@ -1299,7 +1221,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: # pypdfium2 is really finicky with respect to closing things, # if done incorrectly creates seg faults. with PyPDFium2Parser._lock: - with blob.as_bytes_io() as file_path: + with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] pdf_reader = None try: pdf_reader = pypdfium2.PdfDocument( @@ -1307,11 +1229,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: ) full_content = [] - doc_metadata = { - "producer": "PyPDFium2", - "creator": "PyPDFium2", - "creationdate": "", - } | _purge_metadata(pdf_reader.get_metadata_dict()) + doc_metadata = _purge_metadata(pdf_reader.get_metadata_dict()) doc_metadata["source"] = blob.source doc_metadata["total_pages"] = len(pdf_reader) @@ -1345,7 +1263,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: if self.mode == "single": yield Document( - page_content=self.pages_delimiter.join(full_content), + page_content=self.pages_delimitor.join(full_content), metadata=_validate_metadata(doc_metadata), ) finally: @@ -1377,106 +1295,867 @@ def _extract_images_from_page(self, page: pypdfium2._helpers.page.PdfPage) -> st continue numpy.save(image_bytes, image.get_bitmap().to_numpy()) blob = 
Blob.from_data(image_bytes.getvalue(), mime_type="application/x-npy") - text_from_image = next(self.images_parser.lazy_parse(blob)).page_content - str_images.append( - _format_inner_image(blob, text_from_image, self.images_inner_format) - ) + str_images.append(next(self.images_parser.lazy_parse(blob)).page_content) image.close() return _FORMAT_IMAGE_STR.format(image_text=_JOIN_IMAGES.join(str_images)) +# The legacy PDFPlumberParser use key with upper case. +# This is not aligned with the new convention, which requires the key to be in +# lower case. +class _PDFPlumberParserMetadata(dict[object, Any]): + _warning_keys: set[str] = set() + + def __init__(self, d: dict[str, Any]): + super().__init__({k.lower(): v for k, v in d.items()}) + self._pdf_metadata_keys = set(d.keys()) + + def _lower(self, k: object) -> object: + if k in self._pdf_metadata_keys: + lk = str(k).lower() + if lk != k: + if k not in _PDFPlumberParserMetadata._warning_keys: + _PDFPlumberParserMetadata._warning_keys.add(str(k)) + logger.warning( + 'The key "%s" with uppercase is deprecated. ' + "Update your code and vectorstore.", + k, + ) + return lk + else: + return k + + def __contains__(self, k: object) -> bool: + return super().__contains__(self._lower(k)) + + def __delitem__(self, k: object) -> None: + super().__delitem__(self._lower(k)) + + def __getitem__(self, k: object) -> Any: + return super().__getitem__(self._lower(k)) + + def get(self, k: object, default: Any = None) -> Any: + return super().get(self._lower(k), default) + + def __setitem__(self, k: object, v: Any) -> None: + super().__setitem__(self._lower(k), v) + + class PDFPlumberParser(BaseBlobParser): - """Parse `PDF` with `PDFPlumber`.""" + """Parse a blob from a PDF using `pdfplumber` library. + + This class provides methods to parse a blob from a PDF document, supporting various + configurations such as handling password-protected PDFs, extracting images, and + defining extraction mode. 
+ It integrates the 'pdfplumber' library for PDF processing and offers synchronous + blob parsing. + + Examples: + Setup: + + .. code-block:: bash + + pip install -U langchain-community pdfplumber + + Load a blob from a PDF file: + + .. code-block:: python + + from langchain_core.documents.base import Blob + + blob = Blob.from_path("./example_data/layout-parser-paper.pdf") + + Instantiate the parser: + + .. code-block:: python + + from langchain_community.document_loaders.parsers import PDFPlumberParser + + parser = PDFPlumberParser( + # password = None, + mode = "single", + pages_delimitor = "\n\f", + # images_to_text = convert_images_to_text_with_tesseract(), + # extract_tables="markdown", + ) + + Lazily parse the blob: + + .. code-block:: python + + docs = [] + docs_lazy = parser.lazy_parse(blob) + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + """ def __init__( self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False, extract_images: bool = False, + *, + password: Optional[str] = None, + mode: Literal["single", "page"] = "page", + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, + images_parser: Optional[BaseImageBlobParser] = None, + extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, + extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: """Initialize the parser. Args: + password: Optional password for opening encrypted PDFs. + mode: The extraction mode, either "single" for the entire document or "page" + for page-wise extraction. + pages_delimitor: A string delimiter to separate pages in single-mode + extraction. + extract_images: Whether to extract images from the PDF. + images_parser: Optional image blob parser. + extract_tables: Whether to extract images from the PDF in a specific + format, such as "csv", "markdown" or "html". 
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` - dedupe: Avoiding the error of duplicate characters if `dedupe=True`. + dedupe: Avoiding the error of duplicate characters if `dedupe=True` + extract_tables_settings: Optional dictionary of settings for customizing + table extraction. + + Returns: + This method does not directly return data. Use the `parse` or `lazy_parse` + methods to retrieve parsed documents with content and metadata. + + Raises: + ValueError: If the `mode` is not "single" or "page". + ValueError: If the `extract_tables` is not "csv", "markdown" or "html". + + """ + super().__init__() + if mode not in ["single", "page"]: + raise ValueError("mode must be single or page") + if extract_tables and extract_tables not in ["csv", "markdown", "html"]: + raise ValueError("mode must be csv, markdown or html") + if not extract_images and not images_parser: + images_parser = RapidOCRBlobParser() + self.password = password + self.extract_images = extract_images + self.images_parser = images_parser + self.mode = mode + self.pages_delimitor = pages_delimitor + self.dedupe = dedupe + self.text_kwargs = text_kwargs or {} + self.extract_tables = extract_tables + self.extract_tables_settings = extract_tables_settings or { + "vertical_strategy": "lines", + "horizontal_strategy": "lines", + "snap_y_tolerance": 5, + "intersection_x_tolerance": 15, + } + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] + """ + Lazily parse the blob. + Insert image, if possible, between two paragraphs. + In this way, a paragraph can be continued on the next page. + + Args: + blob: The blob to parse. + + Raises: + ImportError: If the `pypdf` package is not found. + + Yield: + An iterator over the parsed documents. 
""" try: - import PIL # noqa:F401 + import pdfplumber except ImportError: raise ImportError( - "pillow package not found, please install it with `pip install pillow`" + "pdfplumber package not found, please install it " + "with `pip install pdfplumber`" ) - self.text_kwargs = text_kwargs or {} - self.dedupe = dedupe - self.extract_images = extract_images - def lazy_parse(self, blob: Blob) -> Iterator[Document]: - """Lazily parse the blob.""" - import pdfplumber - - with blob.as_bytes_io() as file_path: - doc = pdfplumber.open(file_path) # open document - - yield from [ - Document( - page_content=self._process_page_content(page) - + "\n" - + self._extract_images_from_page(page), - metadata=dict( - { - "source": blob.source, - "file_path": blob.source, - "page": page.page_number - 1, - "total_pages": len(doc.pages), - }, - **{ - k: doc.metadata[k] - for k in doc.metadata - if type(doc.metadata[k]) in [str, int] - }, + with blob.as_bytes_io() as file_path: # type: ignore[attr-defined] + doc = pdfplumber.open(file_path, password=self.password) # open document + from pdfplumber.utils import geometry # import WordExctractor, TextMap + + contents = [] + doc_metadata = _purge_metadata( + ( + doc.metadata + | { + "source": blob.source, + "file_path": blob.source, + "total_pages": len(doc.pages), + } + ) + ) + for page in doc.pages: + tables_bbox: list[tuple[float, float, float, float]] = ( + self._extract_tables_bbox_from_page(page) + ) + tables_content = self._extract_tables_from_page(page) + images_bbox = [geometry.obj_to_bbox(image) for image in page.images] + image_from_page = self._extract_images_from_page(page) + page_text = [] + extras = [] + for content in self._split_page_content( + page, + tables_bbox, + tables_content, + images_bbox, + image_from_page, + ): + if isinstance(content, str): # Text + page_text.append(content) + elif isinstance(content, list): # Table + page_text.append(_JOIN_TABLES + self._convert_table(content)) + else: # Image + image_bytes = 
io.BytesIO() + numpy.save(image_bytes, content) + blob = Blob.from_data( + image_bytes.getvalue(), mime_type="application/x-npy" + ) + extras.append( + next(self.images_parser.lazy_parse(blob)).page_content + ) + + all_text = _merge_text_and_extras(extras, "".join(page_text).strip()) + + if self.mode == "page": + # For legacy compatibility, add the last '\n'_ + if not all_text.endswith("\n"): + all_text += "\n" + yield Document( + page_content=all_text, + metadata=_validate_metadata( + _PDFPlumberParserMetadata( + doc_metadata + | { + "page": page.page_number - 1, + } + ) + ), + ) + else: + contents.append(all_text) + # "tables_as_html": [self._convert_table_to_html(table) + # for + # table in tables_content], + # "images": images_content, + # tables_as_html.extend([self._convert_table(table) + # for + # table in tables_content]) + if self.mode == "single": + yield Document( + page_content=self.pages_delimitor.join(contents), + metadata=_validate_metadata( + _PDFPlumberParserMetadata(doc_metadata) ), ) - for page in doc.pages - ] def _process_page_content(self, page: pdfplumber.page.Page) -> str: - """Process the page content based on dedupe.""" + """Process the page content based on dedupe. + + Args: + page: The PDF page to process. + + Returns: + The extracted text from the page. + """ if self.dedupe: return page.dedupe_chars().extract_text(**self.text_kwargs) return page.extract_text(**self.text_kwargs) - def _extract_images_from_page(self, page: pdfplumber.page.Page) -> str: - """Extract images from page and get the text with RapidOCR.""" + def _split_page_content( + self, + page: pdfplumber.page.Page, + tables_bbox: list[tuple[float, float, float, float]], + tables_content: list[list[list[Any]]], + images_bbox: list[tuple[float, float, float, float]], + images_content: list[np.ndarray], + **kwargs: Any, + ) -> Iterator[Union[str, list[list[str]], np.ndarray]]: + """Split the page content into text, tables, and images. + + Args: + page: The PDF page to process. 
+ tables_bbox: Bounding boxes of tables on the page. + tables_content: Content of tables on the page. + images_bbox: Bounding boxes of images on the page. + images_content: Content of images on the page. + **kwargs: Additional keyword arguments. + + Yields: + An iterator over the split content (text, tables, images). + """ + from pdfplumber.utils import ( + geometry, + text, + ) + + # Iterate over words. If a word is in a table, + # yield the accumulated text, and the table + # A the word is in a previously see table, ignore it + # Finish with the accumulated text + kwargs.update( + { + "keep_blank_chars": True, + # "use_text_flow": True, + "presorted": True, + "layout_bbox": kwargs.get("layout_bbox") + # or geometry.objects_to_bbox(page.chars), + or page.cropbox, + } + ) + chars = page.dedupe_chars().objects["char"] if self.dedupe else page.chars + + extractor = text.WordExtractor( + **{k: kwargs[k] for k in text.WORD_EXTRACTOR_KWARGS if k in kwargs} + ) + wordmap = extractor.extract_wordmap(chars) + extract_wordmaps: list[Any] = [] + used_arrays = [False] * len(tables_bbox) + for word, o in wordmap.tuples: + # print(f" Try with '{word['text']}' ...") + is_table = False + word_bbox = geometry.obj_to_bbox(word) + for i, table_bbox in enumerate(tables_bbox): + if geometry.get_bbox_overlap(word_bbox, table_bbox): + # Find a world in a table + # print(" Find in an array") + is_table = True + if not used_arrays[i]: + # First time I see a word in this array + # Yield the previous part + if extract_wordmaps: + new_wordmap = text.WordMap(tuples=extract_wordmaps) + new_textmap = new_wordmap.to_textmap( + **{ + k: kwargs[k] + for k in text.TEXTMAP_KWARGS + if k in kwargs + } + ) + # print(f"yield {new_textmap.to_string()}") + yield new_textmap.to_string() + extract_wordmaps.clear() + # and yield the table + used_arrays[i] = True + # print(f"yield table {i}") + yield tables_content[i] + break + if not is_table: + # print(f' Add {word["text"]}') + 
extract_wordmaps.append((word, o)) + if extract_wordmaps: + # Text after the array ? + new_wordmap = text.WordMap(tuples=extract_wordmaps) + new_textmap = new_wordmap.to_textmap( + **{k: kwargs[k] for k in text.TEXTMAP_KWARGS if k in kwargs} + ) + # print(f"yield {new_textmap.to_string()}") + yield new_textmap.to_string() + # Add images- + for content in images_content: + yield content + + def _extract_images_from_page(self, page: pdfplumber.page.Page) -> list[np.ndarray]: + """Extract images from a PDF page. + + Args: + page: The PDF page to extract images from. + + Returns: + A list of extracted images as numpy arrays. + """ from PIL import Image - if not self.extract_images: - return "" + if not self.images_parser: + return [] images = [] for img in page.images: - if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: - if img["stream"]["BitsPerComponent"] == 1: - images.append( - np.array( - Image.frombytes( - "1", - (img["stream"]["Width"], img["stream"]["Height"]), - img["stream"].get_data(), - ).convert("L") - ) - ) - else: + if "Filter" in img["stream"]: + if img["stream"]["Filter"].name in _PDF_FILTER_WITHOUT_LOSS: images.append( np.frombuffer(img["stream"].get_data(), dtype=np.uint8).reshape( img["stream"]["Height"], img["stream"]["Width"], -1 ) ) - elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: - images.append(img["stream"].get_data()) + elif img["stream"]["Filter"].name in _PDF_FILTER_WITH_LOSS: + buf = np.frombuffer(img["stream"].get_data(), dtype=np.uint8) + images.append(np.array(Image.open(io.BytesIO(buf.tobytes())))) + else: + logger.warning("Unknown PDF Filter!") + + return images + + def _extract_tables_bbox_from_page( + self, + page: pdfplumber.page.Page, + ) -> list[tuple]: + """Extract bounding boxes of tables from a PDF page. + + Args: + page: The PDF page to extract table bounding boxes from. + + Returns: + A list of bounding boxes for tables on the page. 
+ """ + if not self.extract_tables: + return [] + from pdfplumber.table import TableSettings + + table_settings = self.extract_tables_settings + tset = TableSettings.resolve(table_settings) + return [table.bbox for table in page.find_tables(tset)] + + def _extract_tables_from_page( + self, + page: pdfplumber.page.Page, + ) -> list[list[list[Any]]]: + """Extract tables from a PDF page. + + Args: + page: The PDF page to extract tables from. + + Returns: + A list of tables, where each table is a list of rows, and each row is a + list of cell values. + """ + if not self.extract_tables: + return [] + table_settings = self.extract_tables_settings + tables_list = page.extract_tables(table_settings) + return tables_list + + def _convert_table(self, table: list[list[str]]) -> str: + """Convert a table to the specified format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in the specified format. + """ + format = self.extract_tables + if format is None: + return "" + if format == "markdown": + return self._convert_table_to_markdown(table) + elif format == "html": + return self._convert_table_to_html(table) + elif format == "csv": + return self._convert_table_to_csv(table) + else: + raise ValueError(f"Unknown table format: {format}") + + def _convert_table_to_csv(self, table: list[list[str]]) -> str: + """Convert a table to CSV format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in CSV format. 
+ """ + if not table: + return "" + + output = ["\n\n"] + + # skip first row in details if header is part of the table + # j = 0 if self.header.external else 1 + + # iterate over detail rows + for row in table: + line = "" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + line += cell + "," + output.append(line) + return "\n".join(output) + "\n\n" + + def _convert_table_to_html(self, table: list[list[str]]) -> str: + """ + Convert table content as a string in HTML format. + If clean is true, markdown syntax is removed from cell content. + + Args: + table: The table to convert. + + Returns: + The table content as a string in HTML format. + """ + if not len(table): + return "" + output = "\n" + clean = True + + # iterate over detail rows + for row in table: + line = "" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + if clean: # remove sensitive syntax + cell = html.escape(cell.replace("-", "-")) + line += "" + line += "\n" + output += line + return output + "
" + cell + "
\n" + + def _convert_table_to_markdown(self, table: list[list[str]]) -> str: + """Convert table content as a string in Github-markdown format. + + Args: + table: The table to convert. + + Returns: + The table content as a string in Markdown format. + """ + clean = False + if not table: + return "" + col_count = len(table[0]) + + output = "|" + "|".join("" for i in range(col_count)) + "|\n" + output += "|" + "|".join("---" for i in range(col_count)) + "|\n" + + # skip first row in details if header is part of the table + # j = 0 if self.header.external else 1 + + # iterate over detail rows + for row in table: + line = "|" + for i, cell in enumerate(row): + # output None cells with empty string + cell = "" if cell is None else cell.replace("\n", " ") + if clean: # remove sensitive syntax + cell = html.escape(cell.replace("-", "-")) + line += cell + "|" + line += "\n" + output += line + return output + "\n" + + +class ZeroxPDFParser(BaseBlobParser): + """Parse a blob from a PDF using `py-zerox` library. + + This class provides methods to parse a blob from a PDF document, supporting various + configurations such as handling password-protected PDFs, extracting images. + It integrates the 'py-zerox' library for PDF processing and offers synchronous blob + parsing. + + Examples: + Setup: + + .. code-block:: bash + + pip install -U langchain-community py-zerox + + Load a blob from a PDF file: + + .. code-block:: python + + from langchain_core.documents.base import Blob + + blob = Blob.from_path("./example_data/layout-parser-paper.pdf") + + Instantiate the parser: + + .. code-block:: python + + from langchain_community.document_loaders.parsers import ZeroxPDFParser + + parser = ZeroxPDFParser( + # password = None, + mode = "single", + pages_delimitor = "\n\f", + # extract_images = True, + # images_to_text = convert_images_to_text_with_tesseract(), + ) + + Lazily parse the blob: + + .. 
class ZeroxPDFParser(BaseBlobParser):
    """Parse a blob from a PDF using the `py-zerox` library.

    `py-zerox` renders each PDF page to an image and asks a vision-capable
    LLM (through litellm) to transcribe it to markdown. Table and image
    handling is emulated through the model's system prompt. Supports
    "single" and "page" extraction modes.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community py-zerox

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import ZeroxPDFParser

            parser = ZeroxPDFParser(
                mode="single",
                pages_delimitor="\n\f",
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            for doc in parser.lazy_parse(blob):
                docs.append(doc)
    """

    # Silence pyzerox's warning about custom system prompts: this parser
    # sets one deliberately to emulate table/image extraction.
    warnings.filterwarnings(
        "ignore",
        module=r"^pyzerox.models.modellitellm$",
        message=r"\s*Custom system prompt was provided which.*",
    )
    _warn_images_to_text = False  # reserved one-shot warning flag
    _warn_creator = False  # one-shot flag for the unsupported-parser warning
    # Prompt fragment per requested table syntax. BUGFIX: includes the
    # ``None`` key so ``extract_tables=None`` no longer raises KeyError.
    _map_extract_tables = {
        None: "",
        "markdown": "",
        "html": "But, use html syntax for convert all tables. ",
    }
    # Prompt fragment emulating each known image parser.
    _map_extract_images = {
        RapidOCRBlobParser: "",
        TesseractBlobParser: "",
        LLMImageBlobParser: "If you come across a picture, "
        "diagram or other illustration, "
        "describe it. ",
    }
    _prompt = PromptTemplate.from_template(
        "Convert the following PDF page to markdown. "
        "{prompt_tables}"
        "{prompt_images}"
        "Remove the header, footer and page number. "
        "Return only the markdown with no explanation text. "
        "Do not exclude any content from the page. ",
    )

    def __init__(
        self,
        mode: Literal["single", "page"] = "page",
        pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
        images_parser: Optional[BaseImageBlobParser] = None,
        extract_images: bool = True,
        extract_tables: Union[Literal["markdown", "html"], None] = "markdown",
        cleanup: bool = True,
        concurrency: int = 10,
        maintain_format: bool = False,
        model: str = "gpt-4o-mini",
        custom_system_prompt: Optional[str] = None,
        select_pages: Optional[Union[int, Iterable[int]]] = None,
        **zerox_kwargs: dict[str, Any],
    ):
        """Initialize the parser.

        Make sure to set the environment variables (API key, endpoint, ...)
        required by the selected model; see the zerox documentation.

        Args:
            mode: The extraction mode, either "single" for the entire
                document or "page" for page-wise extraction.
            pages_delimitor: A string delimiter to separate pages in
                single-mode extraction.
            images_parser: Image blob parser whose behaviour is emulated via
                the system prompt; defaults to ``RapidOCRBlobParser()`` when
                ``extract_images`` is True and no parser is given.
            extract_images: Whether image content should be transcribed.
            extract_tables: "markdown", "html" or None; the syntax the model
                is asked to use for tables.
            cleanup: Whether to clean up temporary files after processing,
                defaults to True.
            concurrency: The number of concurrent processes to run,
                defaults to 10.
            maintain_format: Whether to maintain the format from the
                previous page, defaults to False.
            model: Vision-capable model to use, in litellm's
                "provider/name" form, e.g. "azure/gpt-4o-mini" or
                "vertex_ai/gemini-1.5-flash-001". See
                https://docs.litellm.ai/docs/providers for naming.
            custom_system_prompt: Overrides the default zerox system prompt.
                Generally not required; zerox emits a friendly warning when
                set. Defaults to None.
            select_pages: Pages to process, a single page number or an
                iterable of page numbers, defaults to None (all pages).
            **zerox_kwargs: Arguments forwarded to the zerox function
                (e.g. ``output_dir``, ``temp_dir``).

        Raises:
            ValueError: If ``mode`` is not "single" or "page".
        """
        if mode not in ["single", "page"]:
            raise ValueError("mode must be single or page")
        if extract_tables not in ["markdown", "html", None]:
            logger.warning("extract_tables must be markdown or html")
            extract_tables = "markdown"
        # BUGFIX: was `if images_parser and not images_parser:` (always
        # False), so the documented RapidOCR fallback never triggered.
        if extract_images and not images_parser:
            images_parser = RapidOCRBlobParser()
        self.mode = mode
        self.pages_delimitor = pages_delimitor
        self.extract_images = extract_images
        self.images_parser = images_parser
        self.extract_tables = extract_tables

        self.cleanup = cleanup
        self.concurrency = concurrency
        self.maintain_format = maintain_format
        self.model = model
        # BUGFIX: keep None here instead of substituting the raw
        # PromptTemplate object; lazy_parse() renders the class template on
        # demand, so zerox always receives a string (or None).
        self.custom_system_prompt = custom_system_prompt
        self.select_pages = select_pages
        self.zerox_kwargs = zerox_kwargs

    @staticmethod
    def _is_valid_url(url: str) -> bool:
        """Return True when *url* has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.netloc) and bool(parsed.scheme)

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
        """Lazily parse the blob.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `py-zerox` package is not installed.

        Yields:
            An iterator over the parsed documents.
        """
        try:
            from pyzerox import zerox
        except ImportError:
            raise ImportError(
                "Could not import pyzerox python package. "
                "Please install it with `pip install py-zerox`."
            )
        temp_file = None
        try:
            if not ZeroxPDFParser._is_valid_url(str(blob.path)):
                # zerox only accepts a local path or a URL: materialize
                # in-memory blobs into a temporary file.
                temp_file = NamedTemporaryFile()
                with open(temp_file.name, "wb") as f:
                    f.write(blob.as_bytes())
                file_path = temp_file.name
            else:
                file_path = str(blob.path)

            with blob.as_bytes_io() as pdf_file_obj:
                doc_metadata = _purge_metadata(self._get_metadata(pdf_file_obj))
            doc_metadata["source"] = blob.source or blob.path

            zerox_prompt = self.custom_system_prompt
            # BUGFIX: the original `not a and b or c` parsed as
            # `(not a and b) or c`, and the template was never rendered.
            if zerox_prompt is None and (self.images_parser or self.extract_tables):
                prompt_tables = ZeroxPDFParser._map_extract_tables[
                    self.extract_tables
                ]
                prompt_images = ""
                if self.images_parser:
                    clazz = self.images_parser.__class__
                    if clazz in ZeroxPDFParser._map_extract_images:
                        prompt_images = ZeroxPDFParser._map_extract_images[clazz]
                    elif not ZeroxPDFParser._warn_creator:
                        # Warn only once per process.
                        ZeroxPDFParser._warn_creator = True
                        logger.warning("images_parser can not be simulate")
                zerox_prompt = ZeroxPDFParser._prompt.format(
                    prompt_tables=prompt_tables, prompt_images=prompt_images
                )
            zerox_output = asyncio.run(
                zerox(
                    file_path=str(file_path),
                    model=self.model,
                    cleanup=self.cleanup,
                    concurrency=self.concurrency,
                    maintain_format=self.maintain_format,
                    custom_system_prompt=zerox_prompt,
                    select_pages=self.select_pages,
                    **self.zerox_kwargs,
                )
            )

            # Convert zerox output to Document instances and yield them.
            if len(zerox_output.pages) > 0:
                doc_metadata = _purge_metadata(
                    {
                        "producer": "ZeroxPDF",
                        "creator": "ZeroxPDF",
                        "creationdate": "",
                    }
                    | doc_metadata
                    | {
                        "total_pages": zerox_output.pages[-1].page,
                        "num_pages": zerox_output.pages[-1].page,  # Deprecated
                    }
                )
                single_texts = []
                for page in zerox_output.pages:
                    text_from_page = page.content
                    # Images are transcribed inline by the vision model.
                    images_from_page = ""
                    all_text = _merge_text_and_extras(
                        [images_from_page], text_from_page
                    )
                    if self.mode == "page":
                        yield Document(
                            page_content=all_text,
                            metadata=_validate_metadata(
                                doc_metadata | {"page": page.page - 1}
                            ),
                        )
                    else:
                        single_texts.append(all_text)
                if self.mode == "single":
                    yield Document(
                        page_content=self.pages_delimitor.join(single_texts),
                        metadata=_validate_metadata(doc_metadata),
                    )
        finally:
            if temp_file:
                temp_file.close()

    def _get_metadata(
        self,
        fp: BinaryIO,
        password: str = "",
        caching: bool = True,
    ) -> dict[str, Any]:
        """Extract metadata from a PDF file with pdfminer.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted.
                Defaults to an empty string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file, including "total_pages".
        """
        from pdfminer.pdfpage import PDFDocument, PDFPage, PDFParser

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument(parser, password=password, caching=caching)
        metadata = {}

        for info in doc.info:
            metadata.update(info)
        for k, v in metadata.items():
            try:
                metadata[k] = PDFMinerParser.resolve_and_decode(v)
            except Exception as e:  # pragma: nocover
                # Treat an unparsable metadata value as a warning instead of
                # failing the whole PDF read.
                logger.warning(
                    '[WARNING] Metadata key "%s" could not be parsed due to '
                    "exception: %s",
                    k,
                    str(e),
                )

        # Count number of pages.
        metadata["total_pages"] = len(list(PDFPage.create_pages(doc)))

        return metadata
Note that ``page`` represents - the index of the result returned from Textract, not necessarily the as-written - page number in the document. - """ def __init__( @@ -1593,14 +2267,14 @@ def __init__( else: self.boto3_textract_client = client - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Iterates over the Blob pages and returns an Iterator with a Document for each page, like the other parsers If multi-page document, blob.path has to be set to the S3 URI and for single page docs the blob.data is taken """ - url_parse_result = urlparse(str(blob.path)) if blob.path else None + url_parse_result = urlparse(str(blob.path)) if blob.path else None # type: ignore[attr-defined] # Either call with S3 path (multi-page) or with bytes (single-page) if ( url_parse_result @@ -1608,13 +2282,13 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: and url_parse_result.netloc ): textract_response_json = self.tc.call_textract( - input_document=str(blob.path), + input_document=str(blob.path), # type: ignore[attr-defined] features=self.textract_features, boto3_textract_client=self.boto3_textract_client, ) else: textract_response_json = self.tc.call_textract( - input_document=blob.as_bytes(), + input_document=blob.as_bytes(), # type: ignore[attr-defined] features=self.textract_features, call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC, boto3_textract_client=self.boto3_textract_client, @@ -1626,9 +2300,21 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: yield Document( page_content=page.get_text(config=self.linearization_config), metadata={"source": blob.source, "page": idx + 1}, + # type: ignore[attr-defined] ) +@deprecated( + since="0.0.7", + removal="0.4.0", + message="langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParser" + "and langchain_community.document_loaders.pdf.DocumentIntelligenceLoader" + " are deprecated. 
Please upgrade to " + "langchain_community.document_loaders.DocumentIntelligenceLoader " + "for any file parsing purpose using Azure Document Intelligence " + "service.", + alternative_import="langchain_community.document_loaders.DocumentIntelligenceLoader", +) class DocumentIntelligenceParser(BaseBlobParser): """Loads a PDF with Azure Document Intelligence (formerly Form Recognizer) and chunks at character level.""" @@ -1645,23 +2331,23 @@ def __init__(self, client: Any, model: str): self.client = client self.model = model - def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: + def _generate_docs(self, blob: Blob, result: Any) -> Iterator[Document]: # type: ignore[valid-type] for p in result.pages: content = " ".join([line.content for line in p.lines]) d = Document( page_content=content, metadata={ - "source": blob.source, + "source": blob.source, # type: ignore[attr-defined] "page": p.page_number, }, ) yield d - def lazy_parse(self, blob: Blob) -> Iterator[Document]: + def lazy_parse(self, blob: Blob) -> Iterator[Document]: # type: ignore[valid-type] """Lazily parse the blob.""" - with blob.as_bytes_io() as file_obj: + with blob.as_bytes_io() as file_obj: # type: ignore[attr-defined] poller = self.client.begin_analyze_document(self.model, file_obj) result = poller.result() diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 6b51e481..e4dced73 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -11,6 +11,7 @@ TYPE_CHECKING, Any, BinaryIO, + Iterable, Iterator, Literal, Mapping, @@ -30,7 +31,7 @@ from langchain_community.document_loaders.dedoc import DedocBaseLoader from langchain_community.document_loaders.parsers.images import BaseImageBlobParser from langchain_community.document_loaders.parsers.pdf import ( - _DEFAULT_PAGES_DELIMITER, + _DEFAULT_PAGE_DELIMITOR, 
AmazonTextractPDFParser, DocumentIntelligenceParser, PDFMinerParser, @@ -38,6 +39,7 @@ PyMuPDFParser, PyPDFium2Parser, PyPDFParser, + ZeroxPDFParser, ) from langchain_community.document_loaders.unstructured import UnstructuredFileLoader @@ -47,6 +49,11 @@ logger = logging.getLogger(__file__) +@deprecated( + since="0.3.13", + removal="1.0", + alternative_import="langchain_unstructured.UnstructuredPDFLoader", +) class UnstructuredPDFLoader(UnstructuredFileLoader): """Load `PDF` files using `Unstructured`. @@ -91,7 +98,7 @@ def __init__( def _get_elements(self) -> list: from unstructured.partition.pdf import partition_pdf - return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) + return partition_pdf(filename=self.file_path, **self.unstructured_kwargs) # type: ignore[arg-type] class BasePDFLoader(BaseLoader, ABC): @@ -174,6 +181,11 @@ def source(self) -> str: return self.web_path if self.web_path is not None else self.file_path +@deprecated( + since="0.3.13", + removal="1.0", + alternative_import="langchain_unstructured.UnstructuredPDFLoader", +) class OnlinePDFLoader(BasePDFLoader): """Load online `PDF`.""" @@ -209,7 +221,7 @@ class PyPDFLoader(BasePDFLoader): # headers = None # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # extract_images = True, # images_parser = RapidOCRBlobParser(), ) @@ -241,11 +253,10 @@ def __init__( password: Optional[Union[str, bytes]] = None, headers: Optional[dict] = None, extract_images: bool = False, - *, - mode: Literal["single", "page"] = "page", + *, # Move after the file_path ? 
images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + mode: Literal["single", "page"] = "page", + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, extraction_mode: Literal["plain", "layout"] = "plain", extraction_kwargs: Optional[dict] = None, ) -> None: @@ -258,19 +269,12 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) - extraction_mode: “plain” for legacy functionality, “layout” extract text - in a fixed width format that closely adheres to the rendered layout in - the source pdf + extraction_mode: “plain” for legacy functionality, “layout” for experimental + layout mode functionality extraction_kwargs: Optional additional parameters for the extraction process. @@ -281,11 +285,10 @@ def __init__( super().__init__(file_path, headers=headers) self.parser = PyPDFParser( password=password, - mode=mode, extract_images=extract_images, images_parser=images_parser, - images_inner_format=images_inner_format, - pages_delimiter=pages_delimiter, + mode=mode, + pages_delimitor=pages_delimitor, extraction_mode=extraction_mode, extraction_kwargs=extraction_kwargs, ) @@ -299,9 +302,11 @@ def lazy_load( In this way, a paragraph can be continued on the next page. 
""" if self.web_path: - blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + blob = Blob.from_data( # type: ignore[attr-defined] + open(self.file_path, "rb").read(), path=self.web_path + ) else: - blob = Blob.from_path(self.file_path) + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from self.parser.lazy_parse(blob) @@ -332,7 +337,7 @@ class PyPDFium2Loader(BasePDFLoader): # headers = None # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # extract_images = True, # images_to_text = convert_images_to_text_with_tesseract(), ) @@ -363,11 +368,10 @@ def __init__( file_path: Union[str, PurePath], *, mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, password: Optional[str] = None, extract_images: bool = False, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", headers: Optional[dict] = None, ): """Initialize with a file path. @@ -379,16 +383,14 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. 
- - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) + extraction_mode: “plain” for legacy functionality, “layout” for experimental + layout mode functionality + extraction_kwargs: Optional additional parameters for the extraction + process. Returns: This class does not directly return data. Use the `load`, `lazy_load` or @@ -400,8 +402,7 @@ def __init__( password=password, extract_images=extract_images, images_parser=images_parser, - images_inner_format=images_inner_format, - pages_delimiter=pages_delimiter, + pages_delimitor=pages_delimitor, ) def lazy_load( @@ -413,12 +414,19 @@ def lazy_load( In this way, a paragraph can be continued on the next page. """ if self.web_path: - blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + blob = Blob.from_data( # type: ignore[attr-defined] + open(self.file_path, "rb").read(), path=self.web_path + ) else: - blob = Blob.from_path(self.file_path) + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from self.parser.parse(blob) +@deprecated( + since="0.3.13", + removal="1.0", + alternative="langchain_community.document_loaders.generic.GenericLoader", +) class PyPDFDirectoryLoader(BaseLoader): """Load and parse a directory of PDF files using 'pypdf' library. 
@@ -585,7 +593,7 @@ class PDFMinerLoader(BasePDFLoader): # headers = None # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # extract_images = True, # images_to_text = convert_images_to_text_with_tesseract(), ) @@ -617,10 +625,9 @@ def __init__( *, password: Optional[str] = None, mode: Literal["single", "page"] = "single", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, extract_images: bool = False, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", headers: Optional[dict] = None, concatenate_pages: Optional[bool] = None, ) -> None: @@ -633,16 +640,10 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. - images_inner_format: The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) concatenate_pages: Deprecated. If True, concatenate all PDF pages into one a single document. Otherwise, return one document per page. @@ -657,8 +658,7 @@ def __init__( images_parser=images_parser, concatenate_pages=concatenate_pages, mode=mode, - pages_delimiter=pages_delimiter, - images_inner_format=images_inner_format, + pages_delimitor=pages_delimitor, ) def lazy_load( @@ -670,14 +670,19 @@ def lazy_load( In this way, a paragraph can be continued on the next page. 
""" if self.web_path: - blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + blob = Blob.from_data( # type: ignore[attr-defined] + open(self.file_path, "rb").read(), path=self.web_path + ) else: - blob = Blob.from_path(self.file_path) + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from self.parser.lazy_parse(blob) class PDFMinerPDFasHTMLLoader(BasePDFLoader): - """Load `PDF` files as HTML content using `PDFMiner`.""" + """Load `PDF` files as HTML content using `PDFMiner`. + Warning, the HTML output is just a positioning of the boxes, + without being able to interpret the HTML in an LLM. + """ def __init__( self, file_path: Union[str, PurePath], *, headers: Optional[dict] = None @@ -741,7 +746,7 @@ class PyMuPDFLoader(BasePDFLoader): # headers = None # password = None, mode = "single", - pages_delimiter = "\n\f", + pages_delimitor = "\n\f", # extract_images = True, # images_parser = TesseractBlobParser(), # extract_tables = "markdown", @@ -775,10 +780,9 @@ def __init__( *, password: Optional[str] = None, mode: Literal["single", "page"] = "page", - pages_delimiter: str = _DEFAULT_PAGES_DELIMITER, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, extract_images: bool = False, images_parser: Optional[BaseImageBlobParser] = None, - images_inner_format: Literal["text", "markdown-img", "html-img"] = "text", extract_tables: Union[Literal["csv", "markdown", "html"], None] = None, headers: Optional[dict] = None, extract_tables_settings: Optional[dict[str, Any]] = None, @@ -793,16 +797,10 @@ def __init__( password: Optional password for opening encrypted PDFs. mode: The extraction mode, either "single" for the entire document or "page" for page-wise extraction. - pages_delimiter: A string delimiter to separate pages in single-mode + pages_delimitor: A string delimiter to separate pages in single-mode extraction. extract_images: Whether to extract images from the PDF. images_parser: Optional image blob parser. 
- images_inner_format: The format for the parsed output. - - "text" = return the content as is - - "markdown-img" = wrap the content into an image markdown link, w/ link - pointing to (`![body)(#)`] - - "html-img" = wrap the content as the `alt` text of an tag and link to - (`{body}`) extract_tables: Whether to extract tables in a specific format, such as "csv", "markdown", or "html". extract_tables_settings: Optional dictionary of settings for customizing @@ -823,11 +821,10 @@ def __init__( self.parser = PyMuPDFParser( password=password, mode=mode, - pages_delimiter=pages_delimiter, + pages_delimitor=pages_delimitor, text_kwargs=kwargs, extract_images=extract_images, images_parser=images_parser, - images_inner_format=images_inner_format, extract_tables=extract_tables, extract_tables_settings=extract_tables_settings, ) @@ -844,9 +841,9 @@ def _lazy_load(self, **kwargs: Any) -> Iterator[Document]: ) parser = self.parser if self.web_path: - blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) # type: ignore[attr-defined] else: - blob = Blob.from_path(self.file_path) + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] yield from parser._lazy_parse(blob, text_kwargs=kwargs) def load(self, **kwargs: Any) -> list[Document]: @@ -1007,7 +1004,59 @@ def load(self) -> list[Document]: class PDFPlumberLoader(BasePDFLoader): - """Load `PDF` files using `pdfplumber`.""" + """Load and parse a PDF file using 'pdfplumber' library. + + This class provides methods to load and parse PDF documents, supporting various + configurations such as handling password-protected files, extracting images, and + defining extraction mode. It integrates the `pdfplumber` library for PDF processing + and offers both synchronous and asynchronous document loading. + + Examples: + Setup: + + .. 
code-block:: bash + + pip install -U langchain-community pdfplumber + + Instantiate the loader: + + .. code-block:: python + + from langchain_community.document_loaders import PDFPlumberLoader + + loader = PDFPlumberLoader( + file_path = "./example_data/layout-parser-paper.pdf", + # headers = None + # password = None, + mode = "single", + pages_delimitor = "\n\f", + # images_to_text = convert_images_to_text_with_tesseract(), + # extract_tables = None, + # extract_tables_settings = None, + # text_kwargs = {"use_text_flow": False, "keep_blank_chars": False}, + # dedupe = False, + ) + + Lazy load documents: + + .. code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + Load documents asynchronously: + + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + """ def __init__( self, @@ -1016,34 +1065,70 @@ def __init__( dedupe: bool = False, headers: Optional[dict] = None, extract_images: bool = False, + *, + password: Optional[str] = None, + mode: Literal["single", "page"] = "page", + images_parser: Optional[BaseImageBlobParser] = None, + pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR, + extract_tables: Optional[Literal["csv", "markdown", "html"]] = None, + extract_tables_settings: Optional[dict[str, Any]] = None, ) -> None: - """Initialize with a file path.""" - try: - import pdfplumber # noqa:F401 - except ImportError: - raise ImportError( - "pdfplumber package not found, please install it with " - "`pip install pdfplumber`" - ) + """Initialize with a file path. - super().__init__(file_path, headers=headers) - self.text_kwargs = text_kwargs or {} - self.dedupe = dedupe - self.extract_images = extract_images + Args: + file_path: The path to the PDF file to be loaded. + headers: Optional headers to use for GET request to download a file from a + web path. 
+ password: Optional password for opening encrypted PDFs. + mode: The extraction mode, either "single" for the entire document or "page" + for page-wise extraction. + pages_delimitor: A string delimiter to separate pages in single-mode + extraction. + extract_images: Whether to extract images from the PDF. + images_parser: Optional image blob parser. + extract_tables: Whether to extract tables in a specific format, such as + "csv", "markdown", or "html". + extract_tables_settings: Optional dictionary of settings for customizing + table extraction. + text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()`` + dedupe: Avoiding the error of duplicate characters if `dedupe=True` - def load(self) -> list[Document]: - """Load file.""" + Returns: + This method does not directly return data. Use the `load`, `lazy_load`, + or `aload` methods + to retrieve parsed documents with content and metadata. - parser = PDFPlumberParser( - text_kwargs=self.text_kwargs, - dedupe=self.dedupe, - extract_images=self.extract_images, + Raises: + ImportError: If the `pdfplumber` package is not installed. + """ + super().__init__(file_path, headers=headers) + self.parser = PDFPlumberParser( + password=password, + mode=mode, + pages_delimitor=pages_delimitor, + extract_images=extract_images, + images_parser=images_parser, + extract_tables=extract_tables, + text_kwargs=text_kwargs, + extract_tables_settings=extract_tables_settings, + dedupe=dedupe, ) + + def lazy_load( + self, + ) -> Iterator[Document]: + """ + Lazy load given path as pages. + Insert image, if possible, between two paragraphs. + In this way, a paragraph can be continued on the next page. 
+ """ if self.web_path: - blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path) + blob = Blob.from_data( # type: ignore[attr-defined] + open(self.file_path, "rb").read(), path=self.web_path + ) else: - blob = Blob.from_path(self.file_path) - return parser.parse(blob) + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] + yield from self.parser.lazy_parse(blob) class AmazonTextractPDFLoader(BasePDFLoader): @@ -1153,11 +1238,11 @@ def lazy_load( ) -> Iterator[Document]: """Lazy load documents""" # the self.file_path is local, but the blob has to include - # the S3 location if the file originated from S3 for multipage documents - # raises ValueError when multipage and not on S3""" + # the S3 location if the file originated from S3 for multi-page documents + # raises ValueError when multi-page and not on S3""" if self.web_path and self._is_s3_url(self.web_path): - blob = Blob(path=self.web_path) + blob = Blob(path=self.web_path) # type: ignore[call-arg] # type: ignore[misc] else: blob = Blob.from_path(self.file_path) if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1: @@ -1170,7 +1255,7 @@ def lazy_load( yield from self.parser.parse(blob) @staticmethod - def _get_number_of_pages(blob: Blob) -> int: + def _get_number_of_pages(blob: Blob) -> int: # type: ignore[valid-type] try: import pypdf from PIL import Image, ImageSequence @@ -1180,20 +1265,22 @@ def _get_number_of_pages(blob: Blob) -> int: "Could not import pypdf or Pilloe python package. " "Please install it with `pip install pypdf Pillow`." 
)
- if blob.mimetype == "application/pdf":
- with blob.as_bytes_io() as input_pdf_file:
+ if blob.mimetype == "application/pdf": # type: ignore[attr-defined]
+ with blob.as_bytes_io() as input_pdf_file: # type: ignore[attr-defined]
 pdf_reader = pypdf.PdfReader(input_pdf_file)
 return len(pdf_reader.pages)
- elif blob.mimetype == "image/tiff":
+ elif blob.mimetype == "image/tiff": # type: ignore[attr-defined]
 num_pages = 0
- img = Image.open(blob.as_bytes())
+ img = Image.open(blob.as_bytes()) # type: ignore[attr-defined]
 for _, _ in enumerate(ImageSequence.Iterator(img)):
 num_pages += 1
 return num_pages
- elif blob.mimetype in ["image/png", "image/jpeg"]:
+ elif blob.mimetype in ["image/png", "image/jpeg"]: # type: ignore[attr-defined]
 return 1
 else:
- raise ValueError(f"unsupported mime type: {blob.mimetype}")
+ raise ValueError( # type: ignore[attr-defined]
+ f"unsupported mime type: {blob.mimetype}"
+ )


class DedocPDFLoader(DedocBaseLoader):
@@ -1340,78 +1427,184 @@ def lazy_load(
 self,
 ) -> Iterator[Document]:
 """Lazy load given path as pages."""
- blob = Blob.from_path(self.file_path)
+ blob = Blob.from_path(self.file_path) # type: ignore[attr-defined]
 yield from self.parser.parse(blob)


class ZeroxPDFLoader(BasePDFLoader):
- """Document loader utilizing Zerox library:
+ """Load and parse a PDF file using 'py-zerox' library.
 https://github.com/getomni-ai/zerox

- Zerox converts PDF document to series of images (page-wise) and
+ This class provides methods to load and parse PDF documents, supporting various
+ configurations such as handling password-protected files, extracting tables,
+ extracting images, and defining extraction mode. It integrates the `py-zerox`
+ library for PDF processing and offers both synchronous and asynchronous document
+ loading.
+
+ Zerox converts PDF document to series of images (page-wise) and
 uses vision-capable LLM model to generate Markdown representation.

- Zerox utilizes anyc operations. 
Therefore when using this loader
+ Zerox utilizes async operations. Therefore when using this loader
 inside Jupyter Notebook (or any environment running async)
 you will need to:
 ```python
 import nest_asyncio
 nest_asyncio.apply()
 ```
+
+ Examples:
+ Setup:
+
+ .. code-block:: bash
+
+ pip install -U langchain-community py-zerox
+
+ Instantiate the loader:
+
+ .. code-block:: python
+
+ from langchain_community.document_loaders import ZeroxPDFLoader
+
+ loader = ZeroxPDFLoader(
+ file_path = "./example_data/layout-parser-paper.pdf",
+ # headers = None
+ # password = None,
+ mode = "single",
+ pages_delimitor = "\n\f",
+ # extract_images = True,
+ # images_to_text = convert_images_to_text_with_tesseract(),
+ # extract_tables = "markdown",
+ # extract_tables_settings = None,
+ )
+
+ Lazy load documents:
+
+ .. code-block:: python
+
+ docs = []
+ docs_lazy = loader.lazy_load()
+
+ for doc in docs_lazy:
+ docs.append(doc)
+ print(docs[0].page_content[:100])
+ print(docs[0].metadata)
+
+ Load documents asynchronously:
+
+ .. code-block:: python
+
+ docs = await loader.aload()
+ print(docs[0].page_content[:100])
+ print(docs[0].metadata)
 """

 def __init__(
 self,
- file_path: Union[str, PurePath],
+ file_path: Union[str, Path],
+ *,
+ headers: Optional[dict] = None,
+ mode: Literal["single", "page"] = "page",
+ pages_delimitor: str = _DEFAULT_PAGE_DELIMITOR,
+ images_parser: Optional[BaseImageBlobParser] = None,
+ extract_images: bool = True,
+ extract_tables: Union[Literal["markdown", "html"], None] = "markdown",
+ cleanup: bool = True,
+ concurrency: int = 10,
+ maintain_format: bool = False,
 model: str = "gpt-4o-mini",
- **zerox_kwargs: Any,
+ custom_system_prompt: Optional[str] = None,
+ select_pages: Optional[Union[int, Iterable[int]]] = None,
+ **zerox_kwargs: Any,
 ) -> None:
- super().__init__(file_path=file_path)
- """Initialize the parser with arguments to be passed to the zerox function. 
+ """ + Initialize the loader with arguments to be passed to the zerox function. Make sure to set necessary environment variables such as API key, endpoint, etc. Check zerox documentation for list of necessary environment variables for any given model. Args: - file_path: - Path or url of the pdf file + file_path: The path to the PDF file to be loaded. + headers: Optional headers to use for GET request to download a file from a + web path. + password: Optional password for opening encrypted PDFs. + mode: The extraction mode, either "single" for the entire document or "page" + for page-wise extraction. + pages_delimitor: A string delimiter to separate pages in single-mode + extraction. + extract_images: Whether to extract images from the PDF. + images_parser: Optional image blob parser. + extract_tables: Whether to extract tables in a specific format, such as + "csv", "markdown", or "html". + extract_tables_settings: Optional dictionary of settings for customizing + table extraction. + cleanup: + Whether to cleanup the temporary files after processing, defaults + to True + concurrency: + The number of concurrent processes to run, defaults to 10 + maintain_format: + Whether to maintain the format from the previous page, defaults to False model: - Vision capable model to use. Defaults to "gpt-4o-mini". - Hosted models are passed in format "/" - Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001" - See more details in zerox documentation. - **zerox_kwargs: + The model to use for generating completions, defaults to "gpt-4o-mini". + Note - Refer: https://docs.litellm.ai/docs/providers to pass correct + model name as according to provider it might be different from + actual name. + output_dir: + The directory to save the markdown output, defaults to None + temp_dir: + The directory to store temporary files, defaults to some named folder + in system's temp directory. If already exists, the contents will be + deleted for zerox uses it. 
+ custom_system_prompt:
+ The system prompt to use for the model, this overrides the default
+ system prompt of zerox. Generally it is not required unless you want
+ some specific behaviour. When set, it will raise a friendly warning,
+ defaults to None
+ select_pages:
+ Pages to process, can be a single page number or an iterable of page
+ numbers, defaults to None
+ **zerox_kwargs:
 Arguments specific to the zerox function.
- see datailed list of arguments here in zerox repository:
- https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25
- """ # noqa: E501
- self.zerox_kwargs = zerox_kwargs
- self.model = model
+ """
+ super().__init__(file_path, headers=headers)
+ self.parser = ZeroxPDFParser(
+ mode=mode,
+ pages_delimitor=pages_delimitor,
+ images_parser=images_parser,
+ extract_images=extract_images,
+ extract_tables=extract_tables,
+ cleanup=cleanup,
+ concurrency=concurrency,
+ maintain_format=maintain_format,
+ model=model,
+ custom_system_prompt=custom_system_prompt,
+ select_pages=select_pages,
+ **zerox_kwargs,
+ )

 def lazy_load(self) -> Iterator[Document]:
- """Lazily load pages."""
- import asyncio
-
- from pyzerox import zerox
-
- # Directly call asyncio.run to execute zerox synchronously
- zerox_output = asyncio.run(
- zerox(file_path=str(self.file_path), model=self.model, **self.zerox_kwargs)
- )
+ """
+ Loads documents from pdf utilizing zerox library:
+ https://github.com/getomni-ai/zerox

- # Convert zerox output to Document instances and yield them
- if len(zerox_output.pages) > 0:
- num_pages = zerox_output.pages[-1].page
- for page in zerox_output.pages:
- yield Document(
- page_content=page.content,
- metadata={
- "source": self.source,
- "page": page.page,
- "num_pages": num_pages,
- },
- )
+ Returns:
+ Iterator[Document]: An iterator over parsed Document instances. 
+ """ + """Lazy load given path as pages.""" + if self.web_path: + blob = Blob.from_data( # type: ignore[attr-defined] + open(self.file_path, "rb").read(), path=self.web_path + ) + else: + blob = Blob.from_path(self.file_path) # type: ignore[attr-defined] + yield from self.parser.lazy_parse(blob) # Legacy: only for backwards compatibility. Use PyPDFLoader instead -PagedPDFSplitter = PyPDFLoader +@deprecated( + since="0.0.30", + removal="1.0", + alternative="PyPDFLoader", +) +class PagedPDFSplitter(PyPDFLoader): + pass From d2ac13b448289966eae7674ce82808bc6b2adc05 Mon Sep 17 00:00:00 2001 From: Philippe Prados Date: Wed, 14 May 2025 14:56:25 +0200 Subject: [PATCH 2/2] Add message for PyPDFDirectoryLoader --- libs/community/langchain_community/document_loaders/pdf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 33e35917..3d118913 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -434,6 +434,7 @@ def lazy_load( since="0.3.24", removal="1.0", alternative="langchain_community.document_loaders.generic.GenericLoader", + message="Use GenericLoader and PyPDFParser instead." ) class PyPDFDirectoryLoader(BaseLoader): """Load and parse a directory of PDF files using 'pypdf' library.