Skip to content

Commit

Permalink
Merge pull request #12 from enoch3712/7-add-pypdf-as-a-documentloader
Browse files Browse the repository at this point in the history
add pypdf as a documentloader
  • Loading branch information
enoch3712 authored May 30, 2024
2 parents 9fdf648 + f071390 commit 31dd376
Show file tree
Hide file tree
Showing 13 changed files with 361 additions and 25 deletions.
Binary file added examples/files/CV_Candidate.pdf
Binary file not shown.
Binary file added examples/files/Job_Offer.pdf
Binary file not shown.
157 changes: 157 additions & 0 deletions examples/resume_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
from typing import List, Optional

from dotenv import load_dotenv
from pydantic import Field
import yaml

from extract_thinker import Extractor, Contract, DocumentLoaderPyPdf
from litellm import Router

from extract_thinker.llm import LLM


def json_to_yaml(json_dict):
    """Serialize a dictionary to a YAML string.

    Args:
        json_dict: the dictionary to serialize.

    Returns:
        The YAML representation produced by ``yaml.dump``.

    Raises:
        ValueError: if *json_dict* is not a dictionary.
    """
    if not isinstance(json_dict, dict):
        raise ValueError("json_dict must be a dictionary")
    return yaml.dump(json_dict)


class RoleContract(Contract):
    """Structured fields to extract from a job-offer document.

    Fix: pydantic's ``Field()`` takes the *default value* as its first
    positional argument, so the original code accidentally set each field's
    default to its own description text (and made every field optional).
    Pass ``description=`` instead so the schema carries the guidance for the
    LLM and non-Optional fields are actually required.
    """

    company_name: str = Field(description="Company name")
    years_of_experience: int = Field(description="Years of experience required. If not mention, calculate with start date and end date")
    is_remote: bool = Field(description="Is the role remote?")
    country: str = Field(description="Country of the role")
    # Optional field: default to None so it stays genuinely optional.
    city: Optional[str] = Field(None, description="City of the role")
    list_of_skills: List[str] = Field(description="""
    list of strings, e.g ["5 years experience", "3 years in React", "Typescript"]
    Make the lists of skills to be a yes/no list, so it can be used in the LLM model as a list of true/false
    """)


class ResumeContract(Contract):
    """Structured fields to extract from a candidate resume.

    Fix: pydantic's ``Field()`` takes the *default value* as its first
    positional argument, so the original code accidentally set each field's
    default to its own description text. Pass ``description=`` instead, and
    give Optional fields an explicit ``None`` default.
    """

    name: str = Field(description="First and Last Name")
    age: Optional[str] = Field(None, description="Age with format DD/MM/YYYY. Empty if not available")
    email: str = Field(description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    address: Optional[str] = Field(None, description="Address")
    city: Optional[str] = Field(None, description="City")
    total_experience: int = Field(description="Total experience in years")
    can_go_to_office: Optional[bool] = Field(None, description="Can go to office. If city/location is not provider, is false. If is the same city, is true")
    list_of_skills: List[bool] = Field(description="Takes the list of skills and returns a list of true/false, if the candidate has that skill. E.g. ['Python', 'JavaScript', 'React', 'Node.js'] -> [True, True, False, True]")


class Person(Contract):
    """Minimal person record: a name plus the raw list of skills.

    Fix: ``Field("First and Last Name")`` set the description text as the
    field's *default value*; use ``description=`` so the field is required
    and the text lands in the schema where the LLM can read it.
    """

    name: str = Field(description="First and Last Name")
    list_of_skills: List[str]

# Load environment variables (API keys for the providers below) from a .env file.
load_dotenv()
# Base directory for building absolute paths to the example PDF files.
cwd = os.getcwd()


def config_router():
    """Build a litellm Router over three primary models with paid fallbacks.

    Returns:
        A configured ``litellm.Router`` that load-balances across the
        DeepInfra/Groq deployments and falls back to Claude/Azure on
        failure or context-window overflow.
    """
    rpm = 5000  # Rate limit in requests per minute

    model_list = [
        {
            "model_name": "Meta-Llama-3-8B-Instruct",
            "litellm_params": {
                "model": "deepinfra/meta-llama/Meta-Llama-3-8B-Instruct",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            },
        },
        {
            "model_name": "Mistral-7B-Instruct-v0.2",
            "litellm_params": {
                "model": "deepinfra/mistralai/Mistral-7B-Instruct-v0.2",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            }
        },
        {
            "model_name": "groq-llama3-8b-8192",
            "litellm_params": {
                "model": "groq/llama3-8b-8192",
                "api_key": os.getenv("GROQ_API_KEY"),
                "rpm": rpm,
            }
        },
    ]

    # Adding fallback models
    fallback_models = [
        {
            "model_name": "claude-3-haiku-20240307",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("CLAUDE_API_KEY"),
            }
        },
        {
            "model_name": "azure-deployment",
            "litellm_params": {
                "model": "azure/<your-deployment-name>",
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_key": os.getenv("AZURE_API_KEY"),
                "rpm": 1440,
            }
        }
    ]

    # Combine the lists
    model_list.extend(fallback_models)

    # Define the router configuration.
    # Fix: fallbacks must reference the registered *model_name* aliases, not
    # the raw litellm model string — "azure/<your-deployment-name>" is not a
    # name in model_list, so that fallback could never be resolved.
    router = Router(
        model_list=model_list,
        default_fallbacks=["claude-3-haiku-20240307", "azure-deployment"],
        context_window_fallbacks=[
            {"Meta-Llama-3-8B-Instruct": ["claude-3-haiku-20240307"]},
            {"groq-llama3-8b-8192": ["claude-3-haiku-20240307"]},
            {"Mistral-7B-Instruct-v0.2": ["claude-3-haiku-20240307"]}
        ],
        set_verbose=True
    )

    return router


# --- Step 1: extract the structured job offer from the PDF -----------------
job_role_path = os.path.join(cwd, "examples", "files", "Job_Offer.pdf")

extractor_job_role = Extractor()
extractor_job_role.load_document_loader(
    DocumentLoaderPyPdf()
)

extractor_job_role.load_llm("gpt-4o")
role_result = extractor_job_role.extract(job_role_path, RoleContract)

print(role_result.json())

# --- Step 2: extract the resume, feeding the job offer in as extra context -
extractor_candidate = Extractor()
extractor_candidate.load_document_loader(
    DocumentLoaderPyPdf()
)

llm = LLM("groq/llama3-8b-8192")  # default model
# llm.load_router(config_router())  # optionally route across fallback models

extractor_candidate.load_llm(llm)

resume_content_path = os.path.join(cwd, "examples", "files", "CV_Candidate.pdf")

# The job offer is rendered to YAML and prepended as context so the resume
# extraction can answer role-relative fields (skills match, office distance).
# Fix: prompt previously read "This is the job cotent. to be mapped" (typo).
job_role_content = "This is the job content, to be mapped: \n" + json_to_yaml(json.loads(role_result.json()))

result = extractor_candidate.extract(resume_content_path,
                                     ResumeContract,
                                     content=job_role_content)

print(result.json())
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .document_loader.cached_document_loader import CachedDocumentLoader
from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_text import DocumentLoaderText
from .models import classification, classification_response
from .process import Process
Expand All @@ -17,6 +18,7 @@
'DocumentLoader',
'CachedDocumentLoader',
'DocumentLoaderTesseract',
'DocumentLoaderPyPdf',
'DocumentLoaderText',
'classification',
'classification_response',
Expand Down
54 changes: 54 additions & 0 deletions extract_thinker/document_loader/document_loader_llm_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from abc import ABC
from io import BytesIO
from PIL import Image
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import extract_json


class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
    """Abstract cached loader that delegates image understanding to an LLM."""

    def __init__(self, content=None, cache_ttl=300, llm=None):
        super().__init__(content, cache_ttl)
        # LLM client used by extract_image_content; may be injected later.
        self.llm = llm

    def extract_image_content(self, image_stream: BytesIO) -> str:
        """
        Extracts text or data from an image using an LLM.
        The actual implementation uses an LLM to process the image content.
        """
        # Decode the stream into a PIL image and base64-encode it for the API.
        base64_image = self.encode_image(Image.open(image_stream))

        prompt_messages = [
            {
                "role": "system",
                "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/jpeg;base64," + base64_image
                        },
                    },
                    {"type": "text", "text": "###JSON Output\n"},
                ],
            },
        ]

        response = self.llm.completion(
            model="claude-3-sonnet-20240229",
            messages=prompt_messages,
        )

        # Take the raw completion text and isolate the JSON payload from it.
        return extract_json(response.choices[0].message.content)
41 changes: 41 additions & 0 deletions extract_thinker/document_loader/document_loader_pypdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import io
from typing import Any, Dict, List, Union
from PyPDF2 import PdfReader
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage


class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
    """Document loader that extracts text from PDF files via PyPDF2."""

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Read a PDF from disk and return its extracted text lines."""
        reader = PdfReader(file_path)
        return self.extract_data_from_pdf(reader)

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Read a PDF from an in-memory stream and return its extracted text lines."""
        reader = PdfReader(stream)
        return self.extract_data_from_pdf(reader)

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Any]:
        """Load several PDFs from disk, one result per path."""
        return [self.load_content_from_file(file_path) for file_path in file_paths]

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Any]:
        """Load several PDFs from streams, one result per stream."""
        return [self.load_content_from_stream(stream) for stream in streams]

    def extract_data_from_pdf(self, reader: PdfReader) -> Union[str, Dict[str, Any]]:
        """Collect every text line of every page under the "text" key.

        Fix: PyPDF2's ``page.extract_text()`` can return ``None`` (e.g. for
        image-only pages without a text layer); guard with ``or ""`` instead
        of crashing on ``None.split``.
        """
        document_data: Dict[str, Any] = {
            "text": []
        }

        for page in reader.pages:
            # Extract text and split by newline characters
            page_text = page.extract_text() or ""
            document_data["text"].extend(page_text.split('\n'))

        # Skip image extraction for now. TODO
        # for img_index, image in enumerate(page.images):
        #     image_data = self.extract_image_content(io.BytesIO(image["data"]))
        #     if image_data:
        #         document_data["images"].append(image_data)

        return document_data
43 changes: 32 additions & 11 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor
from extract_thinker.document_loader.llm_interceptor import LlmInterceptor

from extract_thinker.utils import get_file_extension
from extract_thinker.utils import get_file_extension, encode_image
import yaml


SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]
Expand All @@ -30,6 +31,7 @@ def __init__(
self.document_loaders_by_file_type: Dict[str, DocumentLoader] = {}
self.loader_interceptors: List[LoaderInterceptor] = []
self.llm_interceptors: List[LlmInterceptor] = []
self.extra_content: Optional[str] = None

def add_interceptor(
self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
Expand All @@ -55,10 +57,17 @@ def get_document_loader_for_file(self, file: str) -> DocumentLoader:
def load_document_loader(self, document_loader: DocumentLoader) -> None:
self.document_loader = document_loader

def load_llm(self, model: str) -> None:
self.llm = LLM(model)
def load_llm(self, model: Optional[str] = None) -> None:
if isinstance(model, LLM):
self.llm = model
elif model is not None:
self.llm = LLM(model)
else:
raise ValueError("Either a model string or an LLM object must be provided.")

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False, content: Optional[str] = None) -> Any:
self.extra_content = content

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
if not issubclass(response_model, BaseModel):
raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")

Expand All @@ -71,7 +80,7 @@ def extract(self, source: Union[str, IO, list], response_model: type[BaseModel],
else:
raise ValueError("Source must be a file path, a stream, or a list of dictionaries")

async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> Any:
return await asyncio.to_thread(self.extract, source, response_model, vision)

def extract_from_list(self, data: List[Dict[Any, Any]], response_model: type[BaseModel], vision: bool) -> str:
Expand Down Expand Up @@ -162,9 +171,13 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
async def classify_async(self, input: Union[str, IO], classifications: List[Classification]):
return await asyncio.to_thread(self.classify, input, classifications)

def _extract(
self, content, file_or_stream, response_model, vision=False, is_stream=False
):
def _extract(self,
content,
file_or_stream,
response_model,
vision=False,
is_stream=False
):
# call all the llm interceptors before calling the llm
for interceptor in self.llm_interceptors:
interceptor.intercept(self.llm)
Expand All @@ -177,8 +190,18 @@ def _extract(
},
]

if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.append({"role": "user", "content": "##Extra Content\n\n" + self.extra_content})

if content is not None:
if isinstance(content, dict):
content = yaml.dump(content)
messages.append({"role": "user", "content": "##Content\n\n" + content})

if vision:
base64_encoded_image = self._encode_image_to_base64(
base64_encoded_image = encode_image(
file_or_stream, is_stream
)

Expand All @@ -196,8 +219,6 @@ def _extract(
],
}
]
else:
messages.append({"role": "user", "content": "##Content\n\n" + content})

response = self.llm.request(messages, response_model)
return response
Expand Down
Loading

0 comments on commit 31dd376

Please sign in to comment.