-
Couldn't load subscription status.
- Fork 19.5k
community[minor]: Refactoring PyMuPDF parser, loader and add image blob parsers #29063
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
47 commits
Select commit
Hold shift + click to select a range
21759e2
Prepare the integration of new versions of PDFLoader.
pprados 4607354
Fix Line too long
pprados 668dc9c
Fix Line too long
pprados 7a5b5c5
Fix Line too long
pprados 6340ded
Fix Line too long
pprados 4845781
Update PyMuPDF
pprados 3beda82
Fix tu
pprados 743a83e
Fix review - step 1
pprados b623750
Fix all remarques
pprados 20f5a41
Merge remote-tracking branch 'upstream/master' into pprados/02-pymupdf
pprados 91234f0
Fix remarques
pprados 80ee3f7
Fix Images
pprados 66f97cf
Merge remote-tracking branch 'upstream/master' into pprados/02-pymupdf
pprados 0e6c904
Fix Images
pprados 9b45bd8
Merge branch 'master' into pprados/02-pymupdf
pprados acf4358
Fix deprecated load() with kwargs
pprados d7d3021
Merge branch 'master' into pprados/02-pymupdf
pprados 4762fab
Change the format for images parser
pprados 6121005
Merge branch 'master' into pprados/02-pymupdf
pprados 5910f99
Merge branch 'master' into pprados/02-pymupdf
pprados 7fc01f3
Merge branch 'master' into pprados/02-pymupdf
pprados 0f654a1
Add format "html" and "markdown" for LLMImageBlobParser
pprados e4f36ed
Merge remote-tracking branch 'origin/pprados/02-pymupdf' into pprados…
pprados 4a62529
Fix
pprados 1c78325
Bugfix
pprados 1227dbb
Bugfix
pprados 90085e4
Bug fix
pprados 14264e9
Merge branch 'master' into pprados/02-pymupdf
pprados feacf69
Replace markdown-link to markdown-img
pprados c074729
Merge branch 'master' into pprados/02-pymupdf
pprados ee4784d
Update PyMuPDF
pprados 5d4a256
Add default value for properties
pprados 3d15d39
Fix one bug, update some typos, and style doc strings while reading
eyurtsev 0be6c88
Change the strategy for images_inner_format.
pprados d104ee7
Merge branch 'master' into pprados/02-pymupdf
pprados 023ba11
Fix PIL dependencies
pprados 23a73a9
Fix notebook
pprados a4587f0
Optimise tests
pprados d332958
Merge branch 'master' into pprados/02-pymupdf
pprados 4b37b34
Merge branch 'master' into pprados/02-pymupdf
pprados 2281d05
Merge branch 'master' into pprados/02-pymupdf
pprados 0da73f1
Remove Image.__init__
pprados d012d60
Merge remote-tracking branch 'origin/pprados/02-pymupdf' into pprados…
pprados 882c90d
Merge branch 'master' into pprados/02-pymupdf
pprados 74d3617
Remove Image.__init__
pprados 318f304
Merge branch 'master' into pprados/02-pymupdf
pprados 5ee7b9c
Merge branch 'master' into pprados/02-pymupdf
pprados File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
1,188 changes: 1,154 additions & 34 deletions
1,188
docs/docs/integrations/document_loaders/pymupdf.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
220 changes: 220 additions & 0 deletions
220
libs/community/langchain_community/document_loaders/parsers/images.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,220 @@ | ||
| import base64 | ||
| import io | ||
| import logging | ||
| from abc import abstractmethod | ||
| from typing import TYPE_CHECKING, Iterable, Iterator | ||
|
|
||
| import numpy | ||
| import numpy as np | ||
| from langchain_core.documents import Document | ||
| from langchain_core.language_models import BaseChatModel | ||
| from langchain_core.messages import HumanMessage | ||
|
|
||
| from langchain_community.document_loaders.base import BaseBlobParser | ||
| from langchain_community.document_loaders.blob_loaders import Blob | ||
|
|
||
| if TYPE_CHECKING: | ||
| from PIL.Image import Image | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
class BaseImageBlobParser(BaseBlobParser):
    """Abstract base class for parsing image blobs into text.

    Subclasses implement ``_analyze_image``; ``lazy_parse`` decodes the blob
    into a PIL image, delegates to ``_analyze_image`` and wraps the returned
    text in a ``Document``.
    """

    @abstractmethod
    def _analyze_image(self, img: "Image") -> str:
        """Abstract method to analyze an image and extract textual content.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted text content.
        """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse a blob and yield a Document containing the parsed content.

        A blob whose mimetype is ``application/x-npy`` is read with
        ``numpy.load`` and converted with ``PIL.Image.fromarray``; any other
        blob is opened with ``PIL.Image.open``.

        Args:
            blob: The blob to be parsed.

        Yields:
            A single document whose ``page_content`` is the text returned by
            ``_analyze_image`` and whose metadata is the blob metadata with
            ``"source"`` set to ``blob.source``.

        Raises:
            ImportError: If the ``Pillow`` package is not installed.
        """
        # Guard only the Pillow import. Wrapping the whole parse would also
        # catch an ImportError raised inside `_analyze_image` (e.g. a missing
        # OCR backend) and misreport it as a missing Pillow installation.
        try:
            from PIL import Image as Img
        except ImportError as err:
            raise ImportError(
                "`Pillow` package not found, please install it with "
                "`pip install Pillow`"
            ) from err

        # The image is analyzed while the buffer is still open: `Img.open`
        # may read pixel data lazily from `buf`.
        with blob.as_bytes_io() as buf:
            if blob.mimetype == "application/x-npy":
                img = Img.fromarray(numpy.load(buf))
            else:
                img = Img.open(buf)
            content = self._analyze_image(img)
            logger.debug("Image text: %s", content.replace("\n", "\\n"))
            yield Document(
                page_content=content,
                metadata={**blob.metadata, **{"source": blob.source}},
            )
|
|
||
|
|
||
class RapidOCRBlobParser(BaseImageBlobParser):
    """Image blob parser that extracts text with the RapidOCR library.

    Attributes:
        ocr:
            The RapidOCR instance used for OCR. It starts as ``None`` and is
            created on the first call to ``_analyze_image``.
    """

    def __init__(
        self,
    ) -> None:
        """Create the parser without loading the OCR engine yet."""
        super().__init__()
        self.ocr = None

    def _analyze_image(self, img: "Image") -> str:
        """Run RapidOCR on an image and return the recognized text.

        Args:
            img (Image):
                The image to be analyzed.

        Returns:
            str:
                The second field of every OCR result entry, joined with
                newlines and stripped; an empty string when the engine
                returns no result.

        Raises:
            ImportError: If ``rapidocr-onnxruntime`` cannot be imported.
        """
        # The engine is built lazily so the optional dependency is only
        # required once an image is actually analyzed.
        if not self.ocr:
            try:
                from rapidocr_onnxruntime import RapidOCR

                self.ocr = RapidOCR()
            except ImportError:
                raise ImportError(
                    "`rapidocr-onnxruntime` package not found, please install it with "
                    "`pip install rapidocr-onnxruntime`"
                )

        detections, _ = self.ocr(np.array(img))  # type: ignore
        if not detections:
            return ""
        # Index 1 of each entry is presumably the recognized text; verify
        # against the RapidOCR result format.
        recognized = [entry[1] for entry in detections]
        return "\n".join(recognized).strip()
|
|
||
|
|
||
class TesseractBlobParser(BaseImageBlobParser):
    """Parser for extracting text from images using the Tesseract OCR library."""

    def __init__(
        self,
        *,
        langs: Iterable[str] = ("eng",),
    ) -> None:
        """Initialize the TesseractBlobParser.

        Args:
            langs (Iterable[str]):
                The languages to use for OCR. A single string is accepted as
                well and is treated as one language specification.
        """
        super().__init__()
        # A bare string is itself an iterable of characters: `list("eng")`
        # would give ['e', 'n', 'g'] and Tesseract would be asked for the
        # language "e+n+g". Wrap it so it is kept as a single entry.
        self.langs = [langs] if isinstance(langs, str) else list(langs)

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image and extract text using Tesseract OCR.

        Args:
            img: The image to be analyzed.

        Returns:
            str: The extracted text content, stripped of surrounding
            whitespace.

        Raises:
            ImportError: If the ``pytesseract`` package is not installed.
        """
        try:
            import pytesseract
        except ImportError as err:
            raise ImportError(
                "`pytesseract` package not found, please install it with "
                "`pip install pytesseract`"
            ) from err
        # The configured languages are passed as one "+"-joined string.
        return pytesseract.image_to_string(img, lang="+".join(self.langs)).strip()
|
|
||
|
|
||
# Default prompt of LLMImageBlobParser. It asks the model for a concise,
# retrieval-oriented summary of the image plus all of the text found in it,
# formatted as markdown without a leading ``` delimiter.
_PROMPT_IMAGES_TO_DESCRIPTION: str = (
    "You are an assistant tasked with summarizing images for retrieval. "
    "1. These summaries will be embedded and used to retrieve the raw image. "
    "Give a concise summary of the image that is well optimized for retrieval\n"
    "2. extract all the text from the image. "
    "Do not exclude any content from the page.\n"
    "Format answer in markdown without explanatory text "
    "and without markdown delimiter ``` at the beginning. "
)
|
|
||
|
|
||
class LLMImageBlobParser(BaseImageBlobParser):
    """Parser for analyzing images using a language model (LLM).

    Attributes:
        model (BaseChatModel):
            The language model to use for analysis.
        prompt (str):
            The prompt to provide to the language model.
    """

    def __init__(
        self,
        *,
        model: BaseChatModel,
        prompt: str = _PROMPT_IMAGES_TO_DESCRIPTION,
    ) -> None:
        """Initializes the LLMImageBlobParser.

        Args:
            model (BaseChatModel):
                The language model to use for analysis.
            prompt (str):
                The prompt to provide to the language model. It is sent to
                the model verbatim.
        """
        super().__init__()
        self.model = model
        self.prompt = prompt

    def _analyze_image(self, img: "Image") -> str:
        """Analyze an image using the provided language model.

        The image is re-encoded as PNG, base64-encoded and sent to the model
        as a data URL together with the prompt, in a single human message.

        Args:
            img: The image to be analyzed.

        Returns:
            The extracted textual content.

        Raises:
            TypeError: If the model response content is not a plain string.
        """
        image_bytes = io.BytesIO()
        img.save(image_bytes, format="PNG")
        img_base64 = base64.b64encode(image_bytes.getvalue()).decode("utf-8")
        msg = self.model.invoke(
            [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            # The prompt is passed as-is. It used to go
                            # through `str.format(format=format)`, which fed
                            # the builtin `format` function to the template
                            # and raised on prompts with literal braces.
                            "text": self.prompt,
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                # The media type must match the PNG encoding
                                # chosen in `img.save` above.
                                "url": f"data:image/png;base64,{img_base64}"
                            },
                        },
                    ]
                )
            ]
        )
        result = msg.content
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not isinstance(result, str):
            raise TypeError(
                "Expected the model to return text content, "
                f"got {type(result).__name__}"
            )
        return result
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.