Skip to content

Commit

Permalink
Merge pull request #12 from enoch3712/7-add-pypdf-as-a-documentloader
Browse files Browse the repository at this point in the history
add pypdf as a documentloader
  • Loading branch information
enoch3712 authored May 30, 2024
2 parents 9fdf648 + f071390 commit 31dd376
Show file tree
Hide file tree
Showing 13 changed files with 361 additions and 25 deletions.
Binary file added examples/files/CV_Candidate.pdf
Binary file not shown.
Binary file added examples/files/Job_Offer.pdf
Binary file not shown.
157 changes: 157 additions & 0 deletions examples/resume_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
from typing import List, Optional

from dotenv import load_dotenv
from pydantic import Field
import yaml

from extract_thinker import Extractor, Contract, DocumentLoaderPyPdf
from litellm import Router

from extract_thinker.llm import LLM


def json_to_yaml(json_dict):
    """Serialize a dictionary to a YAML string.

    Args:
        json_dict: the dictionary to serialize.

    Returns:
        The YAML representation produced by ``yaml.dump``.

    Raises:
        ValueError: if *json_dict* is not a dictionary.
    """
    if not isinstance(json_dict, dict):
        raise ValueError("json_dict must be a dictionary")
    return yaml.dump(json_dict)


class RoleContract(Contract):
    """Structured fields to extract from a job-offer document.

    Fix: pydantic's ``Field()`` takes the *default value* as its first
    positional argument, so the original code accidentally set each field's
    default to its own description text (and made every field optional).
    Pass ``description=`` instead so the schema carries the guidance for the
    LLM and non-Optional fields are actually required.
    """

    company_name: str = Field(description="Company name")
    years_of_experience: int = Field(description="Years of experience required. If not mention, calculate with start date and end date")
    is_remote: bool = Field(description="Is the role remote?")
    country: str = Field(description="Country of the role")
    # Optional field: default to None so it stays genuinely optional.
    city: Optional[str] = Field(None, description="City of the role")
    list_of_skills: List[str] = Field(description="""
    list of strings, e.g ["5 years experience", "3 years in React", "Typescript"]
    Make the lists of skills to be a yes/no list, so it can be used in the LLM model as a list of true/false
    """)


class ResumeContract(Contract):
    """Structured fields to extract from a candidate resume.

    Fix: pydantic's ``Field()`` takes the *default value* as its first
    positional argument, so the original code accidentally set each field's
    default to its own description text. Pass ``description=`` instead, and
    give Optional fields an explicit ``None`` default.
    """

    name: str = Field(description="First and Last Name")
    age: Optional[str] = Field(None, description="Age with format DD/MM/YYYY. Empty if not available")
    email: str = Field(description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    address: Optional[str] = Field(None, description="Address")
    city: Optional[str] = Field(None, description="City")
    total_experience: int = Field(description="Total experience in years")
    can_go_to_office: Optional[bool] = Field(None, description="Can go to office. If city/location is not provider, is false. If is the same city, is true")
    list_of_skills: List[bool] = Field(description="Takes the list of skills and returns a list of true/false, if the candidate has that skill. E.g. ['Python', 'JavaScript', 'React', 'Node.js'] -> [True, True, False, True]")


class Person(Contract):
    """Minimal person record: a name plus the raw list of skills.

    Fix: ``Field("First and Last Name")`` set the description text as the
    field's *default value*; use ``description=`` so the field is required
    and the text lands in the schema where the LLM can read it.
    """

    name: str = Field(description="First and Last Name")
    list_of_skills: List[str]

# Load environment variables (API keys for the providers below) from a .env file.
load_dotenv()
# Base directory for building absolute paths to the example PDF files.
cwd = os.getcwd()


def config_router():
    """Build a litellm Router over three primary models with paid fallbacks.

    Returns:
        A configured ``litellm.Router`` that load-balances across the
        DeepInfra/Groq deployments and falls back to Claude/Azure on
        failure or context-window overflow.
    """
    rpm = 5000  # Rate limit in requests per minute

    model_list = [
        {
            "model_name": "Meta-Llama-3-8B-Instruct",
            "litellm_params": {
                "model": "deepinfra/meta-llama/Meta-Llama-3-8B-Instruct",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            },
        },
        {
            "model_name": "Mistral-7B-Instruct-v0.2",
            "litellm_params": {
                "model": "deepinfra/mistralai/Mistral-7B-Instruct-v0.2",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            }
        },
        {
            "model_name": "groq-llama3-8b-8192",
            "litellm_params": {
                "model": "groq/llama3-8b-8192",
                "api_key": os.getenv("GROQ_API_KEY"),
                "rpm": rpm,
            }
        },
    ]

    # Adding fallback models
    fallback_models = [
        {
            "model_name": "claude-3-haiku-20240307",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("CLAUDE_API_KEY"),
            }
        },
        {
            "model_name": "azure-deployment",
            "litellm_params": {
                "model": "azure/<your-deployment-name>",
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_key": os.getenv("AZURE_API_KEY"),
                "rpm": 1440,
            }
        }
    ]

    # Combine the lists
    model_list.extend(fallback_models)

    # Define the router configuration.
    # Fix: fallbacks must reference the registered *model_name* aliases, not
    # the raw litellm model string — "azure/<your-deployment-name>" is not a
    # name in model_list, so that fallback could never be resolved.
    router = Router(
        model_list=model_list,
        default_fallbacks=["claude-3-haiku-20240307", "azure-deployment"],
        context_window_fallbacks=[
            {"Meta-Llama-3-8B-Instruct": ["claude-3-haiku-20240307"]},
            {"groq-llama3-8b-8192": ["claude-3-haiku-20240307"]},
            {"Mistral-7B-Instruct-v0.2": ["claude-3-haiku-20240307"]}
        ],
        set_verbose=True
    )

    return router


# --- Step 1: extract the structured job offer from the PDF -----------------
job_role_path = os.path.join(cwd, "examples", "files", "Job_Offer.pdf")

extractor_job_role = Extractor()
extractor_job_role.load_document_loader(
    DocumentLoaderPyPdf()
)

extractor_job_role.load_llm("gpt-4o")
role_result = extractor_job_role.extract(job_role_path, RoleContract)

print(role_result.json())

# --- Step 2: extract the resume, feeding the job offer in as extra context -
extractor_candidate = Extractor()
extractor_candidate.load_document_loader(
    DocumentLoaderPyPdf()
)

llm = LLM("groq/llama3-8b-8192")  # default model
# llm.load_router(config_router())  # optionally route across fallback models

extractor_candidate.load_llm(llm)

resume_content_path = os.path.join(cwd, "examples", "files", "CV_Candidate.pdf")

# The job offer is rendered to YAML and prepended as context so the resume
# extraction can answer role-relative fields (skills match, office distance).
# Fix: prompt previously read "This is the job cotent. to be mapped" (typo).
job_role_content = "This is the job content, to be mapped: \n" + json_to_yaml(json.loads(role_result.json()))

result = extractor_candidate.extract(resume_content_path,
                                     ResumeContract,
                                     content=job_role_content)

print(result.json())
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .document_loader.cached_document_loader import CachedDocumentLoader
from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_text import DocumentLoaderText
from .models import classification, classification_response
from .process import Process
Expand All @@ -17,6 +18,7 @@
'DocumentLoader',
'CachedDocumentLoader',
'DocumentLoaderTesseract',
'DocumentLoaderPyPdf',
'DocumentLoaderText',
'classification',
'classification_response',
Expand Down
54 changes: 54 additions & 0 deletions extract_thinker/document_loader/document_loader_llm_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from abc import ABC
from io import BytesIO
from PIL import Image
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import extract_json


class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
    """Abstract cached loader that delegates image understanding to an LLM."""

    def __init__(self, content=None, cache_ttl=300, llm=None):
        super().__init__(content, cache_ttl)
        # LLM client used by extract_image_content; may be injected later.
        self.llm = llm

    def extract_image_content(self, image_stream: BytesIO) -> str:
        """
        Extracts text or data from an image using an LLM.
        The actual implementation uses an LLM to process the image content.
        """
        # Decode the stream into a PIL image and base64-encode it for the API.
        base64_image = self.encode_image(Image.open(image_stream))

        prompt_messages = [
            {
                "role": "system",
                "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "data:image/jpeg;base64," + base64_image
                        },
                    },
                    {"type": "text", "text": "###JSON Output\n"},
                ],
            },
        ]

        response = self.llm.completion(
            model="claude-3-sonnet-20240229",
            messages=prompt_messages,
        )

        # Take the raw completion text and isolate the JSON payload from it.
        return extract_json(response.choices[0].message.content)
41 changes: 41 additions & 0 deletions extract_thinker/document_loader/document_loader_pypdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import io
from typing import Any, Dict, List, Union
from PyPDF2 import PdfReader
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage


class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
    """Document loader that extracts text from PDF files via PyPDF2."""

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Read a PDF from disk and return its extracted text lines."""
        reader = PdfReader(file_path)
        return self.extract_data_from_pdf(reader)

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Read a PDF from an in-memory stream and return its extracted text lines."""
        reader = PdfReader(stream)
        return self.extract_data_from_pdf(reader)

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Any]:
        """Load several PDFs from disk, one result per path."""
        return [self.load_content_from_file(file_path) for file_path in file_paths]

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Any]:
        """Load several PDFs from streams, one result per stream."""
        return [self.load_content_from_stream(stream) for stream in streams]

    def extract_data_from_pdf(self, reader: PdfReader) -> Union[str, Dict[str, Any]]:
        """Collect every text line of every page under the "text" key.

        Fix: PyPDF2's ``page.extract_text()`` can return ``None`` (e.g. for
        image-only pages without a text layer); guard with ``or ""`` instead
        of crashing on ``None.split``.
        """
        document_data: Dict[str, Any] = {
            "text": []
        }

        for page in reader.pages:
            # Extract text and split by newline characters
            page_text = page.extract_text() or ""
            document_data["text"].extend(page_text.split('\n'))

        # Skip image extraction for now. TODO
        # for img_index, image in enumerate(page.images):
        #     image_data = self.extract_image_content(io.BytesIO(image["data"]))
        #     if image_data:
        #         document_data["images"].append(image_data)

        return document_data
43 changes: 32 additions & 11 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor
from extract_thinker.document_loader.llm_interceptor import LlmInterceptor

from extract_thinker.utils import get_file_extension
from extract_thinker.utils import get_file_extension, encode_image
import yaml


SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]
Expand All @@ -30,6 +31,7 @@ def __init__(
self.document_loaders_by_file_type: Dict[str, DocumentLoader] = {}
self.loader_interceptors: List[LoaderInterceptor] = []
self.llm_interceptors: List[LlmInterceptor] = []
self.extra_content: Optional[str] = None

def add_interceptor(
self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
Expand All @@ -55,10 +57,17 @@ def get_document_loader_for_file(self, file: str) -> DocumentLoader:
def load_document_loader(self, document_loader: DocumentLoader) -> None:
self.document_loader = document_loader

def load_llm(self, model: str) -> None:
self.llm = LLM(model)
def load_llm(self, model: Optional[str] = None) -> None:
if isinstance(model, LLM):
self.llm = model
elif model is not None:
self.llm = LLM(model)
else:
raise ValueError("Either a model string or an LLM object must be provided.")

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False, content: Optional[str] = None) -> Any:
self.extra_content = content

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
if not issubclass(response_model, BaseModel):
raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")

Expand All @@ -71,7 +80,7 @@ def extract(self, source: Union[str, IO, list], response_model: type[BaseModel],
else:
raise ValueError("Source must be a file path, a stream, or a list of dictionaries")

async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> Any:
return await asyncio.to_thread(self.extract, source, response_model, vision)

def extract_from_list(self, data: List[Dict[Any, Any]], response_model: type[BaseModel], vision: bool) -> str:
Expand Down Expand Up @@ -162,9 +171,13 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
async def classify_async(self, input: Union[str, IO], classifications: List[Classification]):
return await asyncio.to_thread(self.classify, input, classifications)

def _extract(
self, content, file_or_stream, response_model, vision=False, is_stream=False
):
def _extract(self,
content,
file_or_stream,
response_model,
vision=False,
is_stream=False
):
# call all the llm interceptors before calling the llm
for interceptor in self.llm_interceptors:
interceptor.intercept(self.llm)
Expand All @@ -177,8 +190,18 @@ def _extract(
},
]

if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.append({"role": "user", "content": "##Extra Content\n\n" + self.extra_content})

if content is not None:
if isinstance(content, dict):
content = yaml.dump(content)
messages.append({"role": "user", "content": "##Content\n\n" + content})

if vision:
base64_encoded_image = self._encode_image_to_base64(
base64_encoded_image = encode_image(
file_or_stream, is_stream
)

Expand All @@ -196,8 +219,6 @@ def _extract(
],
}
]
else:
messages.append({"role": "user", "content": "##Content\n\n" + content})

response = self.llm.request(messages, response_model)
return response
Expand Down
Loading

0 comments on commit 31dd376

Please sign in to comment.