-
Notifications
You must be signed in to change notification settings - Fork 106
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ExtractThinker first code. Basic tests working
- Loading branch information
Showing
53 changed files
with
911 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
[flake8] | ||
ignore = E501 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
name: Python package workflow | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
pull_request: | ||
branches: | ||
- main | ||
|
||
jobs: | ||
build-and-test: | ||
runs-on: ubuntu-latest | ||
|
||
steps: | ||
- uses: actions/checkout@v3 | ||
- name: Set up Python | ||
uses: actions/setup-python@v3 | ||
with: | ||
python-version: '3.8' | ||
|
||
- name: Install dependencies | ||
run: | | ||
pip install poetry | ||
poetry install | ||
- name: Run tests | ||
run: poetry run pytest | ||
|
||
- name: Build package | ||
run: poetry build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
repos: | ||
- repo: https://github.com/astral-sh/ruff-pre-commit | ||
rev: v0.1.7 # Ruff version | ||
hooks: | ||
- id: ruff # Run the linter. | ||
name: Run Linter Check (Ruff) | ||
args: [ --fix ] | ||
files: ^(extract_thinker|tests|examples)/ | ||
- id: ruff-format # Run the formatter. | ||
name: Run Formatter (Ruff) | ||
- repo: local | ||
hooks: | ||
- id: ci_type_mypy | ||
name: Run Type Check (Mypy) | ||
entry: > | ||
bash -c 'set -o pipefail; | ||
export CUSTOM_PACKAGES="extractthinker/_types/_alias.py extractthinker/cli/cli.py extractthinker/cli/files.py extractthinker/cli/usage.py extractthinker/exceptions.py" && | ||
export CUSTOM_FLAGS="--python-version=3.9 --color-output --no-pretty --follow-imports=skip" && | ||
curl -sSL https://raw.githubusercontent.com/gao-hongnan/omniverse/2fd5de1b8103e955cd5f022ab016b72fa901fa8f/scripts/devops/continuous-integration/type_mypy.sh | | ||
bash' | ||
language: system | ||
types: [python] | ||
pass_filenames: false |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
# Exclude a variety of commonly ignored directories. | ||
exclude = [ | ||
".bzr", | ||
".direnv", | ||
".eggs", | ||
".git", | ||
".git-rewrite", | ||
".hg", | ||
".mypy_cache", | ||
".nox", | ||
".pants.d", | ||
".pytype", | ||
".ruff_cache", | ||
".svn", | ||
".tox", | ||
".venv", | ||
"__pypackages__", | ||
"_build", | ||
"buck-out", | ||
"build", | ||
"dist", | ||
"node_modules", | ||
"venv", | ||
] | ||
|
||
# Same as Black. | ||
line-length = 88 | ||
output-format = "grouped" | ||
|
||
target-version = "py39" | ||
|
||
[lint] | ||
select = [ | ||
# bugbear rules | ||
"B", | ||
# remove unused imports | ||
"F401", | ||
# bare except statements | ||
"E722", | ||
# unused arguments | ||
"ARG", | ||
] | ||
ignore = [ | ||
# mutable defaults | ||
"B006", | ||
"B018", | ||
] | ||
|
||
unfixable = [ | ||
# disable auto fix for print statements | ||
"T201", | ||
"T203", | ||
] | ||
ignore-init-module-imports = true | ||
|
||
[extend-per-file-ignores] | ||
"instructor/distil.py" = ["ARG002"] | ||
"tests/test_distil.py" = ["ARG001"] | ||
"tests/test_patch.py" = ["ARG001"] | ||
"examples/task_planner/task_planner_topological_sort.py" = ["ARG002"] | ||
"examples/citation_with_extraction/main.py" = ["ARG001"] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
// Use IntelliSense to learn about possible attributes. | ||
// Hover to view descriptions of existing attributes. | ||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 | ||
"version": "0.2.0", | ||
"configurations": [ | ||
{ | ||
"name": "Python Debugger: Current File", | ||
"type": "debugpy", | ||
"request": "launch", | ||
"program": "${file}", | ||
"console": "integratedTerminal" | ||
} | ||
] | ||
} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
"""Example wiring of an ``Extractor`` with the Tesseract document loader."""
import os

from dotenv import load_dotenv

from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
from extractor import Extractor
from models import Classification

# Load variables from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()

# Fallback location of the Tesseract binary (default Windows install path).
# Set TESSERACT_PATH in the environment or .env file to use a different one.
DEFAULT_TESSERACT_CMD = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

classifications = [
    Classification(name="Driver License", description="This is a driver license"),
    Classification(name="Invoice", description="This is an invoice"),
]

# Usage
extractor = Extractor()

# Disabled usage examples, kept for reference:
#
# extractor.loadSplitter(ImageSplitter())
# extractor.loadfile(
#     "C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf"
# )
# extractor.split(classifications)
#
# extractor.loadfile("C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf").split(classifications)

extractor.load_document_loader(
    # `or` (rather than a .get default) so an empty TESSERACT_PATH also falls back.
    DocumentLoaderTesseract(os.environ.get("TESSERACT_PATH") or DEFAULT_TESSERACT_CMD)
)
extractor.load_llm("claude-3-haiku-20240307")

# extractor.classify_from_path(
#     "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg",
#     classifications
# )

# extractor.loadfile(
#     "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg"
# )\
#     .split(classifications)\
#     .extract()\
#     .where(lambda x: x.name == "Driver License")\

# user_info = extractor.extract_from_file(
#     'C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg', UserContract, vision=True)

# print(user_info.name)
# print(user_info.age)

# the equivalent of this for the instructor:

# equivalent for this, inside instructor: json.loads(json_string)
11 changes: 11 additions & 0 deletions
11
extract_thinker/document_loader/azure_form_recognizer_loader.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from extract_thinker.document_loader.document_loader import DocumentLoader | ||
|
||
|
||
class AzureFormRecognizerLoader(DocumentLoader):
    """Placeholder loader for Azure Form Recognizer; neither hook is implemented yet."""

    def load_content_from_file(self, file_path):
        """Stub: does no work and returns ``None``."""
        # TODO: implement file loading for Azure Form Recognizer.
        return None

    def load_content_from_stream(self, stream):
        """Stub: does no work and returns ``None``."""
        # TODO: implement stream loading for Azure Form Recognizer.
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from extract_thinker.document_loader.document_loader import DocumentLoader | ||
|
||
|
||
class DocTRLoader(DocumentLoader):
    """Placeholder loader for DocTR; neither hook is implemented yet."""

    def load_content_from_file(self, file_path):
        """Stub: does no work and returns ``None``."""
        # TODO: implement file loading for DocTR.
        return None

    def load_content_from_stream(self, stream):
        """Stub: does no work and returns ``None``."""
        # TODO: implement stream loading for DocTR.
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
from abc import ABC, abstractmethod | ||
from PIL import Image | ||
from io import BytesIO | ||
import pypdfium2 as pdfium | ||
import concurrent.futures | ||
from typing import Any, Dict, List, Union | ||
|
||
|
||
class DocumentLoader(ABC):
    """Abstract base class for loaders that read content from a file or stream.

    Subclasses implement ``load_content_from_file`` and
    ``load_content_from_stream``. The base class holds the ``content`` and
    ``file_path`` attributes and provides helpers that turn a PDF (or a single
    image file) into per-page image bytes.
    """

    def __init__(self, content: Any = None):
        """Store the optional initial ``content``; ``file_path`` starts as ``None``."""
        self.content = content
        self.file_path = None

    @abstractmethod
    def load_content_from_file(self, file_path: str) -> Union[str, object]:
        """Load and return the content of the file at ``file_path``."""

    @abstractmethod
    def load_content_from_stream(self, stream: BytesIO) -> Union[str, object]:
        """Load and return the content read from ``stream``."""

    def getContent(self) -> Any:
        """Return the currently stored content."""
        return self.content

    def convert_pdf_to_images(self, file_path: str, scale: float = 300 / 72) -> List[Dict[int, bytes]]:
        """Convert a PDF into a list of single-entry ``{page_index: image_bytes}`` dicts.

        If ``file_path`` can already be opened as an image, its raw bytes are
        returned unchanged as ``[{0: data}]``. Otherwise the file is opened as
        a PDF and every page is rendered through ``render_page``.

        Args:
            file_path: Path to a PDF or image file.
            scale: Render scale passed to the PDF renderer
                (presumably 300 DPI over PDF's 72 points per inch; confirm).

        Returns:
            One dict per page, ordered by ascending page index.
        """
        if self._is_image(file_path):
            # Already an image: hand back the file bytes untouched.
            with open(file_path, "rb") as f:
                return [{0: f.read()}]

        # Not an image, so treat it as a PDF and render each page.
        pdf_file = pdfium.PdfDocument(file_path)

        # NOTE(review): PDFium is documented as not thread-safe; confirm that
        # rendering from worker threads is safe with the pinned pypdfium2.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.render_page, pdf_file, page_index, scale)
                for page_index in range(len(pdf_file))
            ]
            # Collect in submission order so the result is sorted by page,
            # not by whichever page happened to finish rendering first.
            return [future.result() for future in futures]

    @staticmethod
    def _is_image(file_path: str) -> bool:
        """Return True when ``file_path`` can be opened as an image."""
        try:
            # The context manager closes the handle that Image.open creates.
            with Image.open(file_path):
                return True
        except IOError:
            return False

    @staticmethod
    def render_page(pdf_file: "pdfium.PdfDocument", page_index: int, scale: float) -> Dict[int, bytes]:
        """Render one PDF page to JPEG bytes.

        Args:
            pdf_file: Open PDF document to render from.
            page_index: Index of the page to render.
            scale: Render scale forwarded to ``pdf_file.render``.

        Returns:
            A single-entry dict mapping ``page_index`` to the JPEG bytes.
        """
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=[page_index],
            scale=scale,
        )
        # Only one page index was requested, so take the first rendered image.
        image = list(renderer)[0]
        buffer = BytesIO()
        image.save(buffer, format="jpeg", optimize=True)
        return {page_index: buffer.getvalue()}
49 changes: 49 additions & 0 deletions
49
extract_thinker/document_loader/document_loader_tesseract.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from io import BytesIO | ||
import os | ||
from typing import Union | ||
from PIL import Image | ||
import pytesseract | ||
|
||
from extract_thinker.document_loader.document_loader import DocumentLoader | ||
|
||
from ..utils import get_image_type | ||
|
||
SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"] | ||
|
||
|
||
class DocumentLoaderTesseract(DocumentLoader):
    """Document loader that extracts text from images using Tesseract OCR."""

    def __init__(self, tesseract_cmd, isContainer=False, content=None):
        """Configure pytesseract to use the given Tesseract executable.

        Args:
            tesseract_cmd: Path or command name of the Tesseract executable.
            isContainer: When true, ignore ``tesseract_cmd`` and use the
                ``TESSERACT_PATH`` environment variable (default ``"tesseract"``).
            content: Optional initial content.

        Raises:
            Exception: If the executable is neither an existing file nor
                resolvable on ``PATH``.
        """
        import shutil  # local import: only needed here for the PATH lookup

        # Let the base class initialise ``content`` and ``file_path``.
        super().__init__(content)
        self.tesseract_cmd = tesseract_cmd
        if isContainer:
            # docker path to tesseract
            self.tesseract_cmd = os.environ.get("TESSERACT_PATH", "tesseract")
        # Accept an explicit file path as well as a bare command name found on
        # PATH (e.g. the "tesseract" container default).
        if not (os.path.isfile(self.tesseract_cmd) or shutil.which(self.tesseract_cmd)):
            raise Exception(f"Tesseract not found at {self.tesseract_cmd}")
        # Only touch the pytesseract module-level setting once validated.
        pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd

    def load_content_from_file(self, file_path: str) -> Union[str, object]:
        """Run OCR on the image at ``file_path`` and return the extracted text.

        The text is also stored on ``self.content``.

        Raises:
            Exception: If the image type is unsupported or processing fails;
                the underlying error is chained as the cause.
        """
        try:
            file_type = get_image_type(file_path)
            if file_type not in SUPPORTED_IMAGE_FORMATS:
                raise Exception(f"Unsupported file type: {file_path}")
            # Close the file handle Pillow opens once OCR is done.
            with Image.open(file_path) as image:
                self.content = str(pytesseract.image_to_string(image))
            return self.content
        except Exception as e:
            raise Exception(f"Error processing file: {e}") from e

    def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]:
        """Run OCR on the image in ``stream`` and return the extracted text.

        The text is also stored on ``self.content``. The stream is not closed
        here; it stays owned by the caller.

        Raises:
            Exception: If the image type is unsupported or processing fails;
                the underlying error is chained as the cause.
        """
        try:
            file_type = get_image_type(stream)
            if file_type not in SUPPORTED_IMAGE_FORMATS:
                raise Exception(f"Unsupported stream type: {stream}")
            image = Image.open(stream)
            self.content = str(pytesseract.image_to_string(image))
            return self.content
        except Exception as e:
            raise Exception(f"Error processing stream: {e}") from e
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
|
||
class LlmInterceptor(ABC):
    """Interface for hooks that receive LLM messages together with the response."""

    @abstractmethod
    def process(self, messages: list, response: str) -> None:
        """Handle one ``messages``/``response`` pair; the base version does nothing."""
        return None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from abc import ABC, abstractmethod | ||
|
||
|
||
class LoaderInterceptor(ABC):
    """Interface for hooks that receive a file together with its loaded content."""

    @abstractmethod
    def process(self, file: str, content: str) -> None:
        """Handle one ``file``/``content`` pair.

        Raises:
            NotImplementedError: Always, in this base implementation.
        """
        raise NotImplementedError()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from extract_thinker.document_loader.document_loader import DocumentLoader | ||
|
||
|
||
class TextExtractLoader(DocumentLoader):
    """Placeholder loader for TextExtract; neither hook is implemented yet."""

    def load_content_from_file(self, file_path):
        """Stub: does no work and returns ``None``."""
        # TODO: implement file loading for TextExtract.
        return None

    def load_content_from_stream(self, stream):
        """Stub: does no work and returns ``None``."""
        # TODO: implement stream loading for TextExtract.
        return None
Oops, something went wrong.