| 
12 | 12 |     Any,  | 
13 | 13 |     BinaryIO,  | 
14 | 14 |     Iterator,  | 
 | 15 | +    Literal,  | 
15 | 16 |     Mapping,  | 
16 | 17 |     Optional,  | 
17 | 18 |     Sequence,  | 
 | 
28 | 29 | from langchain_community.document_loaders.blob_loaders import Blob  | 
29 | 30 | from langchain_community.document_loaders.dedoc import DedocBaseLoader  | 
30 | 31 | from langchain_community.document_loaders.parsers.pdf import (  | 
 | 32 | +    CONVERT_IMAGE_TO_TEXT,  | 
31 | 33 |     AmazonTextractPDFParser,  | 
32 | 34 |     DocumentIntelligenceParser,  | 
33 | 35 |     PDFMinerParser,  | 
34 | 36 |     PDFPlumberParser,  | 
35 | 37 |     PyMuPDFParser,  | 
36 | 38 |     PyPDFium2Parser,  | 
37 | 39 |     PyPDFParser,  | 
 | 40 | +    _default_page_delimitor,  | 
38 | 41 | )  | 
39 | 42 | from langchain_community.document_loaders.unstructured import UnstructuredFileLoader  | 
40 | 43 | 
 
  | 
@@ -96,7 +99,8 @@ def __init__(  | 
96 | 99 |         if "~" in self.file_path:  | 
97 | 100 |             self.file_path = os.path.expanduser(self.file_path)  | 
98 | 101 | 
 
  | 
99 |  | -        # If the file is a web path or S3, download it to a temporary file, and use that  | 
 | 102 | +        # If the file is a web path or S3, download it to a temporary file,  | 
 | 103 | +        # and use that. It's better to use a BlobLoader.  | 
100 | 104 |         if not os.path.isfile(self.file_path) and self._is_valid_url(self.file_path):  | 
101 | 105 |             self.temp_dir = tempfile.TemporaryDirectory()  | 
102 | 106 |             _, suffix = os.path.splitext(self.file_path)  | 
@@ -412,51 +416,129 @@ def lazy_load(self) -> Iterator[Document]:  | 
412 | 416 | 
 
  | 
413 | 417 | 
 
  | 
414 | 418 | class PyMuPDFLoader(BasePDFLoader):  | 
415 |  | -    """Load `PDF` files using `PyMuPDF`."""  | 
 | 419 | +    """Load and parse a PDF file using the `PyMuPDF` library.  | 
 | 420 | +
  | 
 | 421 | +    This class provides methods to load and parse PDF documents, supporting various  | 
 | 422 | +    configurations such as handling password-protected files, extracting tables,  | 
 | 423 | +    extracting images, and defining extraction mode. It integrates the `PyMuPDF`  | 
 | 424 | +    library for PDF processing and offers both synchronous and asynchronous document  | 
 | 425 | +    loading.  | 
 | 426 | +
  | 
 | 427 | +    Examples:  | 
 | 428 | +        Setup:  | 
 | 429 | +
  | 
 | 430 | +        .. code-block:: bash  | 
 | 431 | +
  | 
 | 432 | +            pip install -U langchain-community pymupdf  | 
 | 433 | +
  | 
 | 434 | +        Instantiate the loader:  | 
 | 435 | +
  | 
 | 436 | +        .. code-block:: python  | 
 | 437 | +
  | 
 | 438 | +            from langchain_community.document_loaders import PyMuPDFLoader  | 
 | 439 | +
  | 
 | 440 | +            loader = PyMuPDFLoader(  | 
 | 441 | +                file_path = "./example_data/layout-parser-paper.pdf",  | 
 | 442 | +                # headers = None,  | 
 | 443 | +                # password = None,  | 
 | 444 | +                mode = "single",  | 
 | 445 | +                pages_delimitor = "\\n\\f",  | 
 | 446 | +                # extract_images = True,  | 
 | 447 | +                # images_to_text = convert_images_to_text_with_tesseract(),  | 
 | 448 | +                # extract_tables = "markdown",  | 
 | 449 | +                # extract_tables_settings = None,  | 
 | 450 | +            )  | 
 | 451 | +
  | 
 | 452 | +        Lazy load documents:  | 
 | 453 | +
  | 
 | 454 | +        .. code-block:: python  | 
 | 455 | +
  | 
 | 456 | +            docs = []  | 
 | 457 | +            docs_lazy = loader.lazy_load()  | 
 | 458 | +
  | 
 | 459 | +            for doc in docs_lazy:  | 
 | 460 | +                docs.append(doc)  | 
 | 461 | +            print(docs[0].page_content[:100])  | 
 | 462 | +            print(docs[0].metadata)  | 
 | 463 | +
  | 
 | 464 | +        Load documents asynchronously:  | 
 | 465 | +
  | 
 | 466 | +        .. code-block:: python  | 
 | 467 | +
  | 
 | 468 | +            docs = await loader.aload()  | 
 | 469 | +            print(docs[0].page_content[:100])  | 
 | 470 | +            print(docs[0].metadata)  | 
 | 471 | +    """  | 
416 | 472 | 
 
  | 
417 | 473 |     def __init__(  | 
418 | 474 |         self,  | 
419 | 475 |         file_path: Union[str, PurePath],  | 
420 | 476 |         *,  | 
421 |  | -        headers: Optional[dict] = None,  | 
 | 477 | +        password: Optional[str] = None,  | 
 | 478 | +        mode: Literal["single", "page"] = "page",  | 
 | 479 | +        pages_delimitor: str = _default_page_delimitor,  | 
422 | 480 |         extract_images: bool = False,  | 
 | 481 | +        images_to_text: CONVERT_IMAGE_TO_TEXT = None,  | 
 | 482 | +        extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,  | 
 | 483 | +        headers: Optional[dict] = None,  | 
 | 484 | +        extract_tables_settings: Optional[dict[str, Any]] = None,  | 
423 | 485 |         **kwargs: Any,  | 
424 | 486 |     ) -> None:  | 
425 |  | -        """Initialize with a file path."""  | 
426 |  | -        try:  | 
427 |  | -            import fitz  # noqa:F401  | 
428 |  | -        except ImportError:  | 
429 |  | -            raise ImportError(  | 
430 |  | -                "`PyMuPDF` package not found, please install it with "  | 
431 |  | -                "`pip install pymupdf`"  | 
432 |  | -            )  | 
433 |  | -        super().__init__(file_path, headers=headers)  | 
434 |  | -        self.extract_images = extract_images  | 
435 |  | -        self.text_kwargs = kwargs  | 
 | 487 | +        """Initialize with a file path.  | 
436 | 488 | 
  | 
437 |  | -    def _lazy_load(self, **kwargs: Any) -> Iterator[Document]:  | 
438 |  | -        if kwargs:  | 
439 |  | -            logger.warning(  | 
440 |  | -                f"Received runtime arguments {kwargs}. Passing runtime args to `load`"  | 
441 |  | -                f" is deprecated. Please pass arguments during initialization instead."  | 
442 |  | -            )  | 
 | 489 | +        Args:  | 
 | 490 | +            file_path: The path to the PDF file to be loaded.  | 
 | 491 | +            headers: Optional headers to use for the GET request that downloads  | 
 | 492 | +                the file from a web path.  | 
 | 493 | +            password: Optional password for opening encrypted PDFs.  | 
 | 494 | +            mode: The extraction mode, either "single" for the entire document or "page"  | 
 | 495 | +                for page-wise extraction.  | 
 | 496 | +            pages_delimitor: A string delimiter to separate pages in single-mode  | 
 | 497 | +                extraction.  | 
 | 498 | +            extract_images: Whether to extract images from the PDF.  | 
 | 499 | +            images_to_text: Optional function or callable to convert images to text  | 
 | 500 | +                during extraction.  | 
 | 501 | +            extract_tables: Whether to extract tables in a specific format, such as  | 
 | 502 | +                "csv", "markdown", or "html".  | 
 | 503 | +            extract_tables_settings: Optional dictionary of settings for customizing  | 
 | 504 | +                table extraction.  | 
 | 505 | +            **kwargs: Additional keyword arguments for customizing text extraction  | 
 | 506 | +                behavior.  | 
 | 507 | +
  | 
 | 508 | +        Returns:  | 
 | 509 | +            This method does not directly return data. Use the `load`, `lazy_load`, or  | 
 | 510 | +            `aload` methods to retrieve parsed documents with content and metadata.  | 
443 | 511 | 
  | 
444 |  | -        text_kwargs = {**self.text_kwargs, **kwargs}  | 
445 |  | -        parser = PyMuPDFParser(  | 
446 |  | -            text_kwargs=text_kwargs, extract_images=self.extract_images  | 
 | 512 | +        Raises:  | 
 | 513 | +            ValueError: If the `mode` argument is not one of "single" or "page".  | 
 | 514 | +        """  | 
 | 515 | +        if mode not in ["single", "page"]:  | 
 | 516 | +            raise ValueError("mode must be single or page")  | 
 | 517 | +        super().__init__(file_path, headers=headers)  | 
 | 518 | +        self.parser = PyMuPDFParser(  | 
 | 519 | +            password=password,  | 
 | 520 | +            mode=mode,  | 
 | 521 | +            pages_delimitor=pages_delimitor,  | 
 | 522 | +            text_kwargs=kwargs,  | 
 | 523 | +            extract_images=extract_images,  | 
 | 524 | +            images_to_text=images_to_text,  | 
 | 525 | +            extract_tables=extract_tables,  | 
 | 526 | +            extract_tables_settings=extract_tables_settings,  | 
447 | 527 |         )  | 
 | 528 | + | 
 | 529 | +    def lazy_load(self) -> Iterator[Document]:  | 
 | 530 | +        """  | 
 | 531 | +        Lazily load the file at the given path as pages.  | 
 | 532 | +        Images are inserted, where possible, between two paragraphs,  | 
 | 533 | +        so that a paragraph can continue onto the next page.  | 
 | 534 | +        """  | 
 | 535 | +        parser = self.parser  | 
448 | 536 |         if self.web_path:  | 
449 | 537 |             blob = Blob.from_data(open(self.file_path, "rb").read(), path=self.web_path)  # type: ignore[attr-defined]  | 
450 | 538 |         else:  | 
451 | 539 |             blob = Blob.from_path(self.file_path)  # type: ignore[attr-defined]  | 
452 | 540 |         yield from parser.lazy_parse(blob)  | 
453 | 541 | 
 
  | 
454 |  | -    def load(self, **kwargs: Any) -> list[Document]:  | 
455 |  | -        return list(self._lazy_load(**kwargs))  | 
456 |  | - | 
457 |  | -    def lazy_load(self) -> Iterator[Document]:  | 
458 |  | -        yield from self._lazy_load()  | 
459 |  | - | 
460 | 542 | 
 
  | 
461 | 543 | # MathpixPDFLoader implementation taken largely from Daniel Gross's:  | 
462 | 544 | # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21  | 
 | 
0 commit comments