-
Notifications
You must be signed in to change notification settings - Fork 98
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from enoch3712/7-add-pypdf-as-a-documentloader
add pypdf as a documentloader
- Loading branch information
Showing
13 changed files
with
361 additions
and
25 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,157 @@ | ||
import json | ||
import os | ||
from typing import List, Optional | ||
|
||
from dotenv import load_dotenv | ||
from pydantic import Field | ||
import yaml | ||
|
||
from extract_thinker import Extractor, Contract, DocumentLoaderPyPdf | ||
from litellm import Router | ||
|
||
from extract_thinker.llm import LLM | ||
|
||
|
||
def json_to_yaml(json_dict):
    """Serialize a dictionary to a YAML string.

    Args:
        json_dict: The dictionary to convert.

    Returns:
        The text produced by ``yaml.dump`` for ``json_dict``.

    Raises:
        ValueError: If ``json_dict`` is not a ``dict``.
    """
    if isinstance(json_dict, dict):
        # Happy path: hand the dictionary straight to the YAML dumper.
        return yaml.dump(json_dict)

    # Anything that is not a dictionary is rejected.
    raise ValueError("json_dict must be a dictionary")
|
||
|
||
class RoleContract(Contract):
    """Fields to extract from a job offer document.

    The extraction hints are passed as ``description=``. The first positional
    argument of pydantic's ``Field`` is the *default value*, so passing the
    hint positionally would make every field silently default to its own
    hint text (e.g. an ``int`` field defaulting to a sentence).
    Assumes ``Contract`` is a pydantic model, since the fields are declared
    with pydantic's ``Field`` -- TODO confirm.
    """

    company_name: str = Field(description="Company name")
    years_of_experience: int = Field(
        description=(
            "Years of experience required. "
            "If not mention, calculate with start date and end date"
        )
    )
    is_remote: bool = Field(description="Is the role remote?")
    country: str = Field(description="Country of the role")
    # Optional field: defaults to None when the document does not state it.
    city: Optional[str] = Field(None, description="City of the role")
    list_of_skills: List[str] = Field(
        description=(
            '\nlist of strings, e.g ["5 years experience", "3 years in React", "Typescript"]\n'
            "Make the lists of skills to be a yes/no list, so it can be used "
            "in the LLM model as a list of true/false\n"
        )
    )
|
||
|
||
class ResumeContract(Contract):
    """Fields to extract from a candidate resume.

    The extraction hints are passed as ``description=``. The first positional
    argument of pydantic's ``Field`` is the *default value*, so passing the
    hint positionally would make every field silently default to its own
    hint text. ``Optional`` fields default to ``None`` instead.
    Assumes ``Contract`` is a pydantic model, since the fields are declared
    with pydantic's ``Field`` -- TODO confirm.
    """

    name: str = Field(description="First and Last Name")
    age: Optional[str] = Field(
        None, description="Age with format DD/MM/YYYY. Empty if not available"
    )
    email: str = Field(description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    address: Optional[str] = Field(None, description="Address")
    city: Optional[str] = Field(None, description="City")
    total_experience: int = Field(description="Total experience in years")
    can_go_to_office: Optional[bool] = Field(
        None,
        description=(
            "Can go to office. If city/location is not provider, is false. "
            "If is the same city, is true"
        ),
    )
    # One boolean per requested skill, in the same order as the skills given.
    list_of_skills: List[bool] = Field(
        description=(
            "Takes the list of skills and returns a list of true/false, "
            "if the candidate has that skill. "
            "E.g. ['Python', 'JavaScript', 'React', 'Node.js'] -> [True, True, False, True]"
        )
    )
|
||
|
||
class Person(Contract):
    """Minimal contract holding a person's name and a list of skills.

    The hint for ``name`` is passed as ``description=`` because the first
    positional argument of pydantic's ``Field`` is the default value, not the
    description.
    """

    name: str = Field(description="First and Last Name")
    list_of_skills: List[str]
|
||
# Working directory of the process; the example PDF paths are joined onto it.
cwd = os.getcwd()

# Load variables from a .env file so the os.getenv() lookups can see them.
load_dotenv()
|
||
|
||
def config_router():
    """Build a LiteLLM ``Router`` with three primary models and two fallbacks.

    API keys and the Azure base URL are read from the environment
    (``DEEPINFRA_API_KEY``, ``GROQ_API_KEY``, ``CLAUDE_API_KEY``,
    ``AZURE_API_BASE``, ``AZURE_API_KEY``).

    Returns:
        The configured ``Router`` instance.
    """
    rpm = 5000  # Rate limit in requests per minute

    deepinfra_key = os.getenv("DEEPINFRA_API_KEY")

    model_list = [
        {
            "model_name": "Meta-Llama-3-8B-Instruct",
            "litellm_params": {
                "model": "deepinfra/meta-llama/Meta-Llama-3-8B-Instruct",
                "api_key": deepinfra_key,
                "rpm": rpm,
            },
        },
        {
            "model_name": "Mistral-7B-Instruct-v0.2",
            "litellm_params": {
                "model": "deepinfra/mistralai/Mistral-7B-Instruct-v0.2",
                "api_key": deepinfra_key,
                "rpm": rpm,
            },
        },
        {
            "model_name": "groq-llama3-8b-8192",
            "litellm_params": {
                "model": "groq/llama3-8b-8192",
                "api_key": os.getenv("GROQ_API_KEY"),
                "rpm": rpm,
            },
        },
    ]

    # Fallback models, registered in the same model list as the primaries.
    fallback_models = [
        {
            "model_name": "claude-3-haiku-20240307",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("CLAUDE_API_KEY"),
            },
        },
        {
            "model_name": "azure-deployment",
            "litellm_params": {
                # Placeholder: replace with the real Azure deployment name.
                "model": "azure/<your-deployment-name>",
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_key": os.getenv("AZURE_API_KEY"),
                "rpm": 1440,
            },
        },
    ]

    router = Router(
        model_list=model_list + fallback_models,
        # Fallbacks are referenced by their "model_name" group, so derive the
        # names from the entries above rather than repeating the underlying
        # "azure/..." model string, which is not a registered group name.
        default_fallbacks=[entry["model_name"] for entry in fallback_models],
        context_window_fallbacks=[
            {"Meta-Llama-3-8B-Instruct": ["claude-3-haiku-20240307"]},
            {"groq-llama3-8b-8192": ["claude-3-haiku-20240307"]},
            {"Mistral-7B-Instruct-v0.2": ["claude-3-haiku-20240307"]},
        ],
        set_verbose=True,
    )

    return router
|
||
|
||
# --- Step 1: extract the role description from the job offer PDF -----------
extractor_job_role = Extractor()
extractor_job_role.load_document_loader(DocumentLoaderPyPdf())
extractor_job_role.load_llm("gpt-4o")

job_role_path = os.path.join(cwd, "examples", "files", "Job_Offer.pdf")
role_result = extractor_job_role.extract(job_role_path, RoleContract)

print(role_result.json())

# --- Step 2: extract the candidate resume, using the role as extra context -
extractor_candidate = Extractor()
extractor_candidate.load_document_loader(DocumentLoaderPyPdf())

llm = LLM("groq/llama3-8b-8192")  # default model
# To route requests through config_router(), uncomment the next line:
# llm.load_router(config_router())

extractor_candidate.load_llm(llm)

resume_content_path = os.path.join(cwd, "examples", "files", "CV_Candidate.pdf")

# The role result is round-tripped through JSON into a plain dict, rendered
# as YAML, and prefixed with a short instruction for the model.
# NOTE(review): "cotent" looks like a typo for "content"; kept as-is here.
job_role_content = (
    f"This is the job cotent. to be mapped: \n{json_to_yaml(json.loads(role_result.json()))}"
)

result = extractor_candidate.extract(
    resume_content_path,
    ResumeContract,
    content=job_role_content,
)

print(result.json())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
54 changes: 54 additions & 0 deletions
54
extract_thinker/document_loader/document_loader_llm_image.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
from abc import ABC | ||
from io import BytesIO | ||
from PIL import Image | ||
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader | ||
from extract_thinker.utils import extract_json | ||
|
||
|
||
class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
    """Document loader base class that can send images to an LLM.

    Args:
        content: Forwarded unchanged to ``CachedDocumentLoader``.
        cache_ttl: Forwarded unchanged to ``CachedDocumentLoader``.
        llm: Object exposing ``completion(model=..., messages=...)``. It is
            only needed by ``extract_image_content``.
        image_model: Model name passed to ``llm.completion``. Defaults to
            ``DEFAULT_IMAGE_MODEL``.
    """

    DEFAULT_IMAGE_MODEL = "claude-3-sonnet-20240229"

    def __init__(self, content=None, cache_ttl=300, llm=None, image_model=None):
        super().__init__(content, cache_ttl)
        self.llm = llm
        self.image_model = image_model or self.DEFAULT_IMAGE_MODEL

    def extract_image_content(self, image_stream: BytesIO) -> str:
        """Extract data from an image by asking the LLM for a JSON description.

        Args:
            image_stream: Stream containing the image bytes.

        Returns:
            The result of ``extract_json`` applied to the model's reply.

        Raises:
            ValueError: If no LLM was supplied to the constructor.
        """
        # Fail early with a clear message instead of an AttributeError on None.
        if self.llm is None:
            raise ValueError("An LLM is required to extract content from images")

        # Load the image from the stream and encode it to base64.
        # encode_image is not defined in this class; presumably it is
        # inherited from the loader base -- TODO confirm.
        image = Image.open(image_stream)
        base64_image = self.encode_image(image)

        # NOTE(review): the data URL always declares image/jpeg -- confirm
        # that encode_image really produces JPEG data.
        resp = self.llm.completion(
            model=self.image_model,
            messages=[
                {
                    "role": "system",
                    "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": "data:image/jpeg;base64," + base64_image
                            },
                        },
                        {"type": "text", "text": "###JSON Output\n"},
                    ],
                },
            ],
        )

        # Take the text of the first choice and pull the JSON out of it.
        reply_text = resp.choices[0].message.content
        return extract_json(reply_text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import io | ||
from typing import Any, Dict, List, Union | ||
from PyPDF2 import PdfReader | ||
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage | ||
|
||
|
||
class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
    """PDF document loader backed by PyPDF2's ``PdfReader``.

    Only text is extracted; the result is a dict whose ``"text"`` entry holds
    the lines of every page in order.
    """

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Open the PDF at ``file_path`` and return its extracted data."""
        return self.extract_data_from_pdf(PdfReader(file_path))

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Read a PDF from ``stream`` and return its extracted data."""
        return self.extract_data_from_pdf(PdfReader(stream))

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Any]:
        """Load every path in ``file_paths``; results keep the input order."""
        return list(map(self.load_content_from_file, file_paths))

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Any]:
        """Load every stream in ``streams``; results keep the input order."""
        return list(map(self.load_content_from_stream, streams))

    def extract_data_from_pdf(self, reader: PdfReader) -> Union[str, Dict[str, Any]]:
        """Collect the text of all pages of ``reader``, split on newlines.

        Returns:
            A dict of the form ``{"text": [line, ...]}``.
        """
        # TODO: image extraction is skipped for now. The intended approach is
        # to pass each page image to extract_image_content and collect the
        # results under an "images" key.
        text_lines = [
            line
            for page in reader.pages
            for line in page.extract_text().split('\n')
        ]
        return {"text": text_lines}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.