langchain-ai · eyurtsev · Jan 20, 2025 · Jan 2, 2025 · Jan 7, 2025 · Jan 7, 2025
diff --git a/docs/docs/integrations/document_loaders/pymupdf.ipynb b/docs/docs/integrations/document_loaders/pymupdf.ipynb
diff --git a/libs/community/extended_testing_deps.txt b/libs/community/extended_testing_deps.txt
@@ -60,12 +60,14 @@ oracle-ads>=2.9.1,<3
 oracledb>=2.2.0,<3
 pandas>=2.0.1,<3
 pdfminer-six>=20221105,<20240706
+pdfplumber>=0.11
 pgvector>=0.1.6,<0.2
 playwright>=1.48.0,<2
 praw>=7.7.1,<8
 premai>=0.3.25,<0.4
 psychicapi>=0.8.0,<0.9
 pydantic>=2.7.4,<3
+pytesseract>=0.3.13
 py-trello>=0.19.0,<0.20
 pyjwt>=2.8.0,<3
 pymupdf>=1.22.3,<2

diff --git a/libs/community/langchain_community/document_loaders/parsers/__init__.py b/libs/community/langchain_community/document_loaders/parsers/__init__.py
@@ -17,6 +17,12 @@
     from langchain_community.document_loaders.parsers.html import (
         BS4HTMLParser,
     )
+    from langchain_community.document_loaders.parsers.images import (
+        BaseImageBlobParser,
+        LLMImageBlobParser,
+        RapidOCRBlobParser,
+        TesseractBlobParser,
+    )
     from langchain_community.document_loaders.parsers.language import (
         LanguageParser,
     )
@@ -35,15 +41,19 @@
+# Public parser name -> dotted path of the module that defines it.
+# NOTE(review): presumably read by the module-level `__getattr__` to import
+# parsers lazily on first access — confirm against the full file.
 _module_lookup = {
     "AzureAIDocumentIntelligenceParser": "langchain_community.document_loaders.parsers.doc_intelligence",  # noqa: E501
     "BS4HTMLParser": "langchain_community.document_loaders.parsers.html",
+    "BaseImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "DocAIParser": "langchain_community.document_loaders.parsers.docai",
     "GrobidParser": "langchain_community.document_loaders.parsers.grobid",
     "LanguageParser": "langchain_community.document_loaders.parsers.language",
+    "LLMImageBlobParser": "langchain_community.document_loaders.parsers.images",
     "OpenAIWhisperParser": "langchain_community.document_loaders.parsers.audio",
     "PDFMinerParser": "langchain_community.document_loaders.parsers.pdf",
     "PDFPlumberParser": "langchain_community.document_loaders.parsers.pdf",
     "PyMuPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFParser": "langchain_community.document_loaders.parsers.pdf",
     "PyPDFium2Parser": "langchain_community.document_loaders.parsers.pdf",
+    "RapidOCRBlobParser": "langchain_community.document_loaders.parsers.images",
+    "TesseractBlobParser": "langchain_community.document_loaders.parsers.images",
     "VsdxParser": "langchain_community.document_loaders.parsers.vsdx",
 }
 
@@ -57,15 +67,19 @@ def __getattr__(name: str) -> Any:
 
+# Names exported by `from ... parsers import *`; the image blob parsers are
+# listed alongside the existing document parsers.
 __all__ = [
     "AzureAIDocumentIntelligenceParser",
+    "BaseImageBlobParser",
     "BS4HTMLParser",
     "DocAIParser",
     "GrobidParser",
     "LanguageParser",
+    "LLMImageBlobParser",
     "OpenAIWhisperParser",
     "PDFMinerParser",
     "PDFPlumberParser",
     "PyMuPDFParser",
     "PyPDFParser",
     "PyPDFium2Parser",
+    "RapidOCRBlobParser",
+    "TesseractBlobParser",
     "VsdxParser",
 ]
diff --git a/libs/community/langchain_community/document_loaders/parsers/images.py b/libs/community/langchain_community/document_loaders/parsers/images.py
@@ -0,0 +1,220 @@
+import base64
+import io
+import logging
+from abc import abstractmethod
+from typing import TYPE_CHECKING, Iterable, Iterator
+
+import numpy
+import numpy as np
+from langchain_core.documents import Document
+from langchain_core.language_models import BaseChatModel
+from langchain_core.messages import HumanMessage
+
+from langchain_community.document_loaders.base import BaseBlobParser
+from langchain_community.document_loaders.blob_loaders import Blob
+
+if TYPE_CHECKING:
+    from PIL.Image import Image
+
+logger = logging.getLogger(__name__)
+
+
+class BaseImageBlobParser(BaseBlobParser):
+    """Abstract base class for parsing image blobs into text.
+
+    Subclasses implement `_analyze_image`; `lazy_parse` decodes the blob into
+    a PIL image and wraps the extracted text in a `Document`.
+    """
+
+    @abstractmethod
+    def _analyze_image(self, img: "Image") -> str:
+        """Analyze an image and extract its textual content.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+          The extracted text content.
+        """
+
+    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Lazily parse a blob and yield a Document with the parsed content.
+
+        Blobs whose mimetype is ``application/x-npy`` are loaded with
+        ``numpy.load`` and converted with ``Image.fromarray``; every other
+        blob is opened with ``Image.open``.
+
+        Args:
+            blob (Blob): The blob to be parsed.
+
+        Yields:
+            Document:
+              A single document whose content is the text returned by
+              `_analyze_image` and whose metadata is the blob metadata with
+              ``source`` set to ``blob.source``.
+
+        Raises:
+            ImportError: If the `Pillow` package is not installed.
+        """
+        # Guard only the Pillow import. An ImportError raised further down
+        # (e.g. by a subclass whose OCR backend is missing) must surface with
+        # its own message rather than be reported as a missing Pillow install.
+        try:
+            from PIL import Image as Img
+        except ImportError:
+            raise ImportError(
+                "`Pillow` package not found, please install it with "
+                "`pip install Pillow`"
+            )
+
+        with blob.as_bytes_io() as buf:
+            if blob.mimetype == "application/x-npy":
+                img = Img.fromarray(numpy.load(buf))
+            else:
+                img = Img.open(buf)
+            content = self._analyze_image(img)
+            # Newlines are escaped so the debug record stays on one line.
+            logger.debug("Image text: %s", content.replace("\n", "\\n"))
+            yield Document(
+                page_content=content,
+                # `source` is written last so it overrides any `source` key
+                # already present in the blob metadata.
+                metadata={**blob.metadata, "source": blob.source},
+            )
+
+
+class RapidOCRBlobParser(BaseImageBlobParser):
+    """Image blob parser that extracts text with the RapidOCR library.
+
+    Attributes:
+        ocr:
+          The RapidOCR engine; ``None`` until the first image is analyzed.
+    """
+
+    def __init__(self) -> None:
+        """Create the parser without instantiating the OCR engine yet."""
+        super().__init__()
+        self.ocr = None
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Run RapidOCR on an image and return the recognized text.
+
+        The engine is created on first use and then kept on ``self.ocr``.
+
+        Args:
+            img (Image):
+              The image to be analyzed.
+
+        Returns:
+            str:
+              The second element of every OCR result entry, joined with
+              newlines and stripped; an empty string when the engine returns
+              no result.
+
+        Raises:
+            ImportError: If `rapidocr-onnxruntime` cannot be imported.
+        """
+        if not self.ocr:
+            try:
+                from rapidocr_onnxruntime import RapidOCR
+
+                self.ocr = RapidOCR()
+            except ImportError:
+                raise ImportError(
+                    "`rapidocr-onnxruntime` package not found, please install it with "
+                    "`pip install rapidocr-onnxruntime`"
+                )
+        detections, _ = self.ocr(np.array(img))  # type: ignore
+        if not detections:
+            return ""
+        recognized = [entry[1] for entry in detections]
+        return "\n".join(recognized).strip()
+
+
+class TesseractBlobParser(BaseImageBlobParser):
+    """Parser for extracting text from images using the Tesseract OCR library."""
+
+    def __init__(
+        self,
+        *,
+        langs: Iterable[str] = ("eng",),
+    ):
+        """Initialize the TesseractBlobParser.
+
+        Args:
+            langs (Iterable[str]):
+              The language codes to use for OCR; they are copied into a list
+              stored on ``self.langs``.
+        """
+        super().__init__()
+        self.langs = [*langs]
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Run Tesseract OCR on an image and return the recognized text.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+            str: The OCR output with surrounding whitespace stripped.
+
+        Raises:
+            ImportError: If `pytesseract` is not installed.
+        """
+        try:
+            import pytesseract
+        except ImportError:
+            raise ImportError(
+                "`pytesseract` package not found, please install it with "
+                "`pip install pytesseract`"
+            )
+        # The configured languages are passed as one "+"-joined string.
+        lang_spec = "+".join(self.langs)
+        text = pytesseract.image_to_string(img, lang=lang_spec)
+        return text.strip()
+
+
+# Default instruction sent with every image by `LLMImageBlobParser`.
+_PROMPT_IMAGES_TO_DESCRIPTION: str = (
+    "You are an assistant tasked with summarizing images for retrieval. "
+    "1. These summaries will be embedded and used to retrieve the raw image. "
+    "Give a concise summary of the image that is well optimized for retrieval\n"
+    "2. extract all the text from the image. "
+    "Do not exclude any content from the page.\n"
+    "Format answer in markdown without explanatory text "
+    "and without markdown delimiter ``` at the beginning. "
+)
+
+
+class LLMImageBlobParser(BaseImageBlobParser):
+    """Parser for analyzing images using a language model (LLM).
+
+    Attributes:
+        model (BaseChatModel):
+          The language model to use for analysis.
+        prompt (str):
+          The prompt to provide to the language model.
+    """
+
+    def __init__(
+        self,
+        *,
+        model: BaseChatModel,
+        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
+    ):
+        """Initializes the LLMImageBlobParser.
+
+        Args:
+            model (BaseChatModel):
+              The language model to use for analysis.
+            prompt (str):
+              The prompt to provide to the language model. It is passed
+              through ``str.format`` before being sent, so literal braces
+              must be doubled (``{{`` / ``}}``).
+        """
+        super().__init__()
+        self.model = model
+        self.prompt = prompt
+
+    def _analyze_image(self, img: "Image") -> str:
+        """Analyze an image using the provided language model.
+
+        The image is re-encoded as PNG and sent to the model as a base64
+        data URL in a single human message, together with the prompt.
+
+        Args:
+            img: The image to be analyzed.
+
+        Returns:
+            The textual content of the model's reply.
+        """
+        image_bytes = io.BytesIO()
+        img.save(image_bytes, format="PNG")
+        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
+        # NOTE(review): `format` here resolves to the Python builtin, so a
+        # `{format}` field in a custom prompt renders as the builtin's repr.
+        # The default prompt has no replacement fields. Kept as-is so prompts
+        # that already escape braces keep working — confirm intended value.
+        prompt_text = self.prompt.format(format=format)
+        msg = self.model.invoke(
+            [
+                HumanMessage(
+                    content=[
+                        {
+                            "type": "text",
+                            "text": prompt_text,
+                        },
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                # The payload was encoded as PNG above, so the
+                                # declared media type must be image/png.
+                                "url": f"data:image/png;base64,{img_base64}"
+                            },
+                        },
+                    ]
+                )
+            ]
+        )
+        result = msg.content
+        # Narrows the reply to `str` for type checkers; a plain-text reply
+        # is expected here.
+        assert isinstance(result, str)
+        return result