Skip to content

Commit

Permalink
ExtractThinker first code. Basic tests working
Browse files Browse the repository at this point in the history
  • Loading branch information
enoch3712 committed Apr 30, 2024
1 parent cb5d01d commit 6836b02
Show file tree
Hide file tree
Showing 53 changed files with 911 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[flake8]
ignore = E501
31 changes: 31 additions & 0 deletions .github/workflows/workflow.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
name: Python package workflow

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
build-and-test:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.8'

- name: Install dependencies
run: |
pip install poetry
poetry install
- name: Run tests
run: poetry run pytest

- name: Build package
run: poetry build
23 changes: 23 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.1.7 # Ruff version
  hooks:
    - id: ruff # Run the linter.
      name: Run Linter Check (Ruff)
      args: [ --fix ]
      # The package directory in this repository is `extract_thinker`
      # (with an underscore); the previous pattern `extractthinker` never
      # matched it, so the linter silently skipped the package sources.
      files: ^(extract_thinker|tests|examples)/
    - id: ruff-format # Run the formatter.
      name: Run Formatter (Ruff)
- repo: local
hooks:
- id: ci_type_mypy
name: Run Type Check (Mypy)
entry: >
bash -c 'set -o pipefail;
export CUSTOM_PACKAGES="extractthinker/_types/_alias.py extractthinker/cli/cli.py extractthinker/cli/files.py extractthinker/cli/usage.py extractthinker/exceptions.py" &&
export CUSTOM_FLAGS="--python-version=3.9 --color-output --no-pretty --follow-imports=skip" &&
curl -sSL https://raw.githubusercontent.com/gao-hongnan/omniverse/2fd5de1b8103e955cd5f022ab016b72fa901fa8f/scripts/devops/continuous-integration/type_mypy.sh |
bash'
language: system
types: [python]
pass_filenames: false
62 changes: 62 additions & 0 deletions .ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Exclude a variety of commonly ignored directories.
exclude = [
".bzr",
".direnv",
".eggs",
".git",
".git-rewrite",
".hg",
".mypy_cache",
".nox",
".pants.d",
".pytype",
".ruff_cache",
".svn",
".tox",
".venv",
"__pypackages__",
"_build",
"buck-out",
"build",
"dist",
"node_modules",
"venv",
]

# Same as Black.
line-length = 88
output-format = "grouped"

target-version = "py39"

[lint]
select = [
# bugbear rules
"B",
# remove unused imports
"F401",
# bare except statements
"E722",
# unused arguments
"ARG",
]
ignore = [
# mutable defaults
"B006",
"B018",
]

unfixable = [
# disable auto fix for print statements
"T201",
"T203",
]
ignore-init-module-imports = true

[extend-per-file-ignores]
"instructor/distil.py" = ["ARG002"]
"tests/test_distil.py" = ["ARG001"]
"tests/test_patch.py" = ["ARG001"]
"examples/task_planner/task_planner_topological_sort.py" = ["ARG002"]
"examples/citation_with_extraction/main.py" = ["ARG001"]

15 changes: 15 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal"
}
]
}
Empty file added extract_thinker/__init__.py
Empty file.
51 changes: 51 additions & 0 deletions extract_thinker/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from dotenv import load_dotenv

from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
from extractor import Extractor
from models import Classification


# Demo / scratch script: wires an Extractor to a Tesseract-based document
# loader and an LLM. Everything below runs at import time.

# Load variables from a .env file (python-dotenv) before anything is configured.
load_dotenv()

# Document categories referenced by the (currently disabled) split/classify
# examples further down.
classifications = [
    Classification(name="Driver License", description="This is a driver license"),
    Classification(name="Invoice", description="This is an invoice"),
]

# Usage
extractor = Extractor()

# Disabled example: split a PDF with an image splitter.
# extractor.loadSplitter(ImageSplitter())
# extractor.loadfile(
# "C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf"
# )
# extractor.split(classifications)

# Disabled example: the same flow as a single chained call.
# extractor.loadfile("C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf").split(classifications)

# NOTE(review): the Tesseract executable path is a hard-coded Windows install
# location, so this script only runs as-is on a machine with that layout.
extractor.load_document_loader(
    DocumentLoaderTesseract("C:\\Program Files\\Tesseract-OCR\\tesseract.exe")
)
# The string looks like an Anthropic model identifier; how it is resolved is
# up to Extractor.load_llm, which is not shown here.
extractor.load_llm("claude-3-haiku-20240307")

# Disabled example: classify a single image against `classifications`.
# extractor.classify_from_path(
# "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg",
# classifications
# )

# Disabled example: fluent load -> split -> extract -> filter pipeline.
# extractor.loadfile(
# "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg"
# )\
# .split(classifications)\
# .extract()\
# .where(lambda x: x.name == "Driver License")\

# Disabled example: extract a typed contract from an image using vision.
# user_info = extractor.extract_from_file(
# 'C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg', UserContract, vision=True)

# print(user_info.name)
# print(user_info.age)

# Author's notes (unfinished):
# the equivalent of this for the instructor:

# equivalent for this, inside instructor: json.loads(json_string)
11 changes: 11 additions & 0 deletions extract_thinker/document_loader/azure_form_recognizer_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from extract_thinker.document_loader.document_loader import DocumentLoader


class AzureFormRecognizerLoader(DocumentLoader):
    """Placeholder loader for Azure Form Recognizer.

    Neither loading method is implemented yet; both accept their argument
    and return ``None``.
    """

    def load_content_from_file(self, file_path):
        """Not implemented yet for Azure Form Recognizer; returns ``None``."""
        return None

    def load_content_from_stream(self, stream):
        """Not implemented yet for Azure Form Recognizer; returns ``None``."""
        return None
11 changes: 11 additions & 0 deletions extract_thinker/document_loader/doctr_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from extract_thinker.document_loader.document_loader import DocumentLoader


class DocTRLoader(DocumentLoader):
    """Placeholder loader for DocTR.

    Neither loading method is implemented yet; both accept their argument
    and return ``None``.
    """

    def load_content_from_file(self, file_path):
        """Not implemented yet for DocTR; returns ``None``."""
        return None

    def load_content_from_stream(self, stream):
        """Not implemented yet for DocTR; returns ``None``."""
        return None
67 changes: 67 additions & 0 deletions extract_thinker/document_loader/document_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from abc import ABC, abstractmethod
from PIL import Image
from io import BytesIO
import pypdfium2 as pdfium
import concurrent.futures
from typing import Any, Dict, List, Union


class DocumentLoader(ABC):
    """Abstract base class for document loaders.

    Subclasses implement ``load_content_from_file`` and
    ``load_content_from_stream``. The base class keeps a ``content``
    attribute and provides a helper that renders a PDF into per-page JPEG
    bytes.
    """

    def __init__(self, content: Any = None):
        """Store optional initial ``content``; ``file_path`` starts as ``None``."""
        self.content = content
        self.file_path = None

    @abstractmethod
    def load_content_from_file(self, file_path: str) -> Union[str, object]:
        """Load and return the content of the file at ``file_path``."""
        pass

    @abstractmethod
    def load_content_from_stream(self, stream: BytesIO) -> Union[str, object]:
        """Load and return the content read from ``stream``."""
        pass

    def getContent(self) -> Any:
        """Return the content currently stored on this loader."""
        return self.content

    def convert_pdf_to_images(self, file_path: str, scale: float = 300 / 72) -> List[Dict[int, bytes]]:
        """Return the pages of ``file_path`` as a list of ``{page_index: bytes}`` dicts.

        If Pillow can open ``file_path`` as an image, the file's raw bytes are
        returned unchanged as ``[{0: data}]``. Otherwise the file is opened as
        a PDF and every page is rendered to JPEG bytes.

        Args:
            file_path: Path to an image or PDF file.
            scale: Render scale forwarded to pypdfium2. The default
                ``300 / 72`` presumably targets 300 DPI, given that PDF units
                are 1/72 inch; confirm against the pypdfium2 docs.

        Returns:
            One single-entry dict per page, ordered by page index.
        """
        # Probe whether the file is already an image. The context manager
        # closes the handle Pillow opened, so the probe does not leak it.
        try:
            with Image.open(file_path):
                is_image = True
        except OSError:  # IOError is an alias of OSError
            is_image = False

        if is_image:
            # Already an image: hand back the raw bytes as page 0.
            with open(file_path, "rb") as f:
                return [{0: f.read()}]

        # Not an image: treat it as a PDF and render every page.
        pdf_file = pdfium.PdfDocument(file_path)

        # NOTE(review): pypdfium2 documents PDFium as not thread-safe; confirm
        # that rendering one document from several threads is acceptable.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [
                executor.submit(self.render_page, pdf_file, page_index, scale)
                for page_index in range(len(pdf_file))
            ]
            # Collect in submission order so the result is deterministic and
            # sorted by page index (as_completed yields in finish order).
            return [future.result() for future in futures]

    @staticmethod
    def render_page(pdf_file: pdfium.PdfDocument, page_index: int, scale: float) -> Dict[int, bytes]:
        """Render one page of ``pdf_file`` and return ``{page_index: jpeg_bytes}``."""
        renderer = pdf_file.render(
            pdfium.PdfBitmap.to_pil,
            page_indices=[page_index],
            scale=scale,
        )
        # Exactly one page index was requested, so take the first image.
        image = list(renderer)[0]
        buffer = BytesIO()
        image.save(buffer, format="jpeg", optimize=True)
        return {page_index: buffer.getvalue()}
49 changes: 49 additions & 0 deletions extract_thinker/document_loader/document_loader_tesseract.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from io import BytesIO
import os
from typing import Union
from PIL import Image
import pytesseract

from extract_thinker.document_loader.document_loader import DocumentLoader

from ..utils import get_image_type

SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]


class DocumentLoaderTesseract(DocumentLoader):
    """Document loader that extracts text from images with Tesseract OCR."""

    def __init__(self, tesseract_cmd, isContainer=False, content=None):
        """Configure pytesseract to use the given Tesseract executable.

        Args:
            tesseract_cmd: Path to (or name of) the Tesseract executable.
            isContainer: When true, ``tesseract_cmd`` is ignored and the
                command is read from the ``TESSERACT_PATH`` environment
                variable, falling back to ``"tesseract"``.
            content: Optional initial content stored on the loader.

        Raises:
            Exception: If the command is neither an existing file nor
                resolvable on ``PATH``.
        """
        # Function-scope import keeps this block self-contained.
        import shutil

        # Initialise base-class state (content, file_path).
        super().__init__(content)
        self.tesseract_cmd = tesseract_cmd
        if isContainer:
            # docker path to tesseract
            self.tesseract_cmd = os.environ.get("TESSERACT_PATH", "tesseract")
        # Accept an explicit file path or a command name found on PATH; the
        # container default "tesseract" is a bare name and is never a file.
        if not os.path.isfile(self.tesseract_cmd) and shutil.which(self.tesseract_cmd) is None:
            raise Exception(f"Tesseract not found at {self.tesseract_cmd}")
        # Only touch pytesseract's module-level setting after validation.
        pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd

    @staticmethod
    def _ocr(source) -> str:
        """Open ``source`` (path or binary stream) with Pillow and return the OCR text."""
        # The context manager releases a handle Pillow opened itself; a stream
        # passed in by the caller is left open.
        with Image.open(source) as image:
            return str(pytesseract.image_to_string(image))

    def load_content_from_file(self, file_path: str) -> Union[str, object]:
        """Run OCR on the image at ``file_path``, store the text and return it.

        Raises:
            Exception: If the image type is not in ``SUPPORTED_IMAGE_FORMATS``
                or any step fails; the original error is chained as the cause.
        """
        try:
            if get_image_type(file_path) not in SUPPORTED_IMAGE_FORMATS:
                raise Exception(f"Unsupported file type: {file_path}")
            self.content = self._ocr(file_path)
            return self.content
        except Exception as e:
            raise Exception(f"Error processing file: {e}") from e

    def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]:
        """Run OCR on the image held in ``stream``, store the text and return it.

        Raises:
            Exception: If the image type is not in ``SUPPORTED_IMAGE_FORMATS``
                or any step fails; the original error is chained as the cause.
        """
        try:
            # NOTE(review): assumes get_image_type leaves the stream positioned
            # where Pillow can still read the image; verify in utils.
            if get_image_type(stream) not in SUPPORTED_IMAGE_FORMATS:
                raise Exception(f"Unsupported stream type: {stream}")
            self.content = self._ocr(stream)
            return self.content
        except Exception as e:
            raise Exception(f"Error processing stream: {e}") from e
7 changes: 7 additions & 0 deletions extract_thinker/document_loader/llm_interceptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from abc import ABC, abstractmethod


class LlmInterceptor(ABC):
    """Abstract hook with a single ``process`` method taking messages and a response."""

    @abstractmethod
    def process(self, messages: list, response: str) -> None:
        """Handle ``messages`` and ``response``; the base implementation does nothing.

        Presumably these are the messages sent to the LLM and its reply;
        confirm against the caller.
        """
        return None
7 changes: 7 additions & 0 deletions extract_thinker/document_loader/loader_interceptor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from abc import ABC, abstractmethod


class LoaderInterceptor(ABC):
    """Abstract hook with a single ``process`` method taking a file and its content."""

    @abstractmethod
    def process(self, file: str, content: str) -> None:
        """Handle ``file`` and ``content``; the base implementation always raises.

        Raises:
            NotImplementedError: Always, when the base implementation is called.
        """
        raise NotImplementedError()
11 changes: 11 additions & 0 deletions extract_thinker/document_loader/text_extract_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from extract_thinker.document_loader.document_loader import DocumentLoader


class TextExtractLoader(DocumentLoader):
    """Placeholder loader for TextExtract.

    Neither loading method is implemented yet; both accept their argument
    and return ``None``.
    """

    def load_content_from_file(self, file_path):
        """Not implemented yet for TextExtract; returns ``None``."""
        return None

    def load_content_from_stream(self, stream):
        """Not implemented yet for TextExtract; returns ``None``."""
        return None
Loading

0 comments on commit 6836b02

Please sign in to comment.