ExtractThinker first code. Basic tests working

enoch3712 · Apr 30, 2024 · 6836b02 · 6836b02
1 parent cb5d01d
commit 6836b02
Show file tree

Hide file tree

Showing 53 changed files with 911 additions and 1 deletion.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,2 @@
+[flake8]
+# E501 is the "line too long" check; it is turned off for the whole project.
+ignore = E501
diff --git a/.github/workflows/workflow.yml b/.github/workflows/workflow.yml
@@ -0,0 +1,31 @@
+# CI workflow: installs the project with Poetry, runs the tests, then builds the package.
+name: Python package workflow
+
+# Runs on pushes to main and on pull requests that target main.
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build-and-test:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        # Quoted so YAML keeps the version a string instead of parsing it as a number.
+        # NOTE(review): .ruff.toml targets py39 and the mypy hook passes
+        # --python-version=3.9, while CI runs on 3.8 - confirm the intended minimum.
+        python-version: '3.8'
+
+    # Poetry is installed with pip, then used to install the project's dependencies.
+    - name: Install dependencies
+      run: |
+        pip install poetry
+        poetry install
+
+    # pytest is invoked through Poetry so it runs in the environment created above.
+    - name: Run tests
+      run: poetry run pytest
+
+    # Runs after the tests; a failing test step stops the job before this point.
+    - name: Build package
+      run: poetry build
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,23 @@
+# Pre-commit hooks: Ruff lint (with auto-fix) and Ruff format, then a local mypy type check.
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.7 # Ruff version
+    hooks:
+      - id: ruff # Run the linter.
+        name: Run Linter Check (Ruff)
+        args: [ --fix ]
+        # The package directory is `extract_thinker` (with an underscore). The
+        # pattern has to name it exactly, otherwise the linter never runs on
+        # the package sources.
+        files: ^(extract_thinker|tests|examples)/
+      - id: ruff-format       # Run the formatter.
+        name: Run Formatter (Ruff)
+  - repo: local
+    hooks:
+      - id: ci_type_mypy
+        name: Run Type Check (Mypy)
+        # The type-check script is downloaded from a URL pinned to a specific
+        # commit hash and piped straight into bash.
+        # NOTE(review): CUSTOM_PACKAGES lists paths under `extractthinker/`
+        # (no underscore); confirm these files exist in this repository and
+        # update the list to the real `extract_thinker/` modules if they do not.
+        entry: >
+            bash -c 'set -o pipefail;
+            export CUSTOM_PACKAGES="extractthinker/_types/_alias.py extractthinker/cli/cli.py extractthinker/cli/files.py extractthinker/cli/usage.py extractthinker/exceptions.py" &&
+            export CUSTOM_FLAGS="--python-version=3.9 --color-output --no-pretty --follow-imports=skip" &&
+            curl -sSL https://raw.githubusercontent.com/gao-hongnan/omniverse/2fd5de1b8103e955cd5f022ab016b72fa901fa8f/scripts/devops/continuous-integration/type_mypy.sh |
+            bash'
+        language: system
+        types: [python]
+        # The hook ignores the staged file list; the script decides what to check.
+        pass_filenames: false
diff --git a/.ruff.toml b/.ruff.toml
@@ -0,0 +1,62 @@
+# Ruff configuration: excluded directories, formatting width, and the lint rule set.
+# Exclude a variety of commonly ignored directories.
+exclude = [
+    ".bzr",
+    ".direnv",
+    ".eggs",
+    ".git",
+    ".git-rewrite",
+    ".hg",
+    ".mypy_cache",
+    ".nox",
+    ".pants.d",
+    ".pytype",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pypackages__",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "node_modules",
+    "venv",
+]
+
+# Same as Black.
+line-length = 88
+output-format = "grouped"
+
+# Python version Ruff assumes when applying rules and fixes.
+target-version = "py39"
+
+[lint]
+# Only the rule groups listed here are enabled.
+select = [
+  # bugbear rules
+  "B",
+  # remove unused imports
+  "F401",
+  # bare except statements
+  "E722",
+  # unused arguments
+  "ARG",
+]
+# Rules from the selected groups that are switched off again.
+ignore = [
+  # mutable defaults
+  "B006",
+  "B018",
+]
+
+# NOTE(review): T201/T203 are not in `select` above, so this entry only has an
+# effect if those rules get enabled some other way - confirm it is still needed.
+unfixable = [
+  # disable auto fix for print statements
+  "T201",
+  "T203",
+]
+ignore-init-module-imports = true
+
+# NOTE(review): the paths below (`instructor/...`, `tests/test_distil.py`, ...)
+# do not match the `extract_thinker/` files added in this commit; presumably
+# they were carried over from another project - verify and prune.
+[extend-per-file-ignores]
+"instructor/distil.py" = ["ARG002"]
+"tests/test_distil.py" = ["ARG001"]
+"tests/test_patch.py" = ["ARG001"]
+"examples/task_planner/task_planner_topological_sort.py" = ["ARG002"]
+"examples/citation_with_extraction/main.py" = ["ARG001"]
+
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,15 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            // Single debug profile: launches whichever file is active in the
+            // editor (${file}) under debugpy, attached to the integrated terminal.
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal"
+        }
+    ]
+}
diff --git a/extract_thinker/__init__.py b/extract_thinker/__init__.py
diff --git a/extract_thinker/app.py b/extract_thinker/app.py
@@ -0,0 +1,51 @@
+from dotenv import load_dotenv
+
+from extract_thinker.document_loader.document_loader_tesseract import DocumentLoaderTesseract
+from extractor import Extractor
+from models import Classification
+
+
+# Scratch/demo script: wires an Extractor to a Tesseract document loader and an
+# LLM model name. Everything below runs at import time.
+
+# Load environment variables from a .env file before the extractor is configured.
+load_dotenv()
+
+# Document categories, each a name plus a short description. In this view they
+# are only referenced by the commented-out experiments further down.
+classifications = [
+    Classification(name="Driver License", description="This is a driver license"),
+    Classification(name="Invoice", description="This is an invoice"),
+]
+
+# Usage
+extractor = Extractor()
+
+# Disabled experiment: split a PDF with an image splitter.
+# extractor.loadSplitter(ImageSplitter())
+# extractor.loadfile(
+#     "C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf"
+# )
+# extractor.split(classifications)
+
+# Disabled experiment: the same flow written as a fluent chain.
+# extractor.loadfile("C:\\Users\\Lopez\\Desktop\\MagniFinance\\examples\\outputTestOne.pdf").split(classifications)
+
+# NOTE(review): the Tesseract path is a hard-coded Windows install location, so
+# this script only runs as-is on a machine with that exact path.
+extractor.load_document_loader(
+    DocumentLoaderTesseract("C:\\Program Files\\Tesseract-OCR\\tesseract.exe")
+)
+# Select the model by name; presumably credentials come from the environment
+# loaded above - TODO confirm against Extractor.load_llm.
+extractor.load_llm("claude-3-haiku-20240307")
+
+# Disabled experiment: classify a single image.
+# extractor.classify_from_path(
+#     "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg",
+#     classifications
+# )
+
+# Disabled experiment: load, split, extract, then filter by classification name.
+# extractor.loadfile(
+#     "C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg"
+#     )\
+#     .split(classifications)\
+#     .extract()\
+#     .where(lambda x: x.name == "Driver License")\
+
+# Disabled experiment: extract a typed contract from an image with vision enabled.
+# user_info = extractor.extract_from_file(
+#     'C:\\Users\\Lopez\\Desktop\\ExtractThinker\\driverLicense.jpg', UserContract, vision=True)
+
+# print(user_info.name)
+# print(user_info.age)
+
+# the equivalent of this for the instructor:
+
+# equivalent for this, inside instructor: json.loads(json_string)
diff --git a/extract_thinker/document_loader/azure_form_recognizer_loader.py b/extract_thinker/document_loader/azure_form_recognizer_loader.py
@@ -0,0 +1,11 @@
+from extract_thinker.document_loader.document_loader import DocumentLoader
+
+
+class AzureFormRecognizerLoader(DocumentLoader):
+    def load_content_from_file(self, file_path):
+        # Implement this method for Azure Form Recognizer
+        pass
+
+    def load_content_from_stream(self, stream):
+        # Implement this method for Azure Form Recognizer
+        pass
diff --git a/extract_thinker/document_loader/doctr_loader.py b/extract_thinker/document_loader/doctr_loader.py
@@ -0,0 +1,11 @@
+from extract_thinker.document_loader.document_loader import DocumentLoader
+
+
+class DocTRLoader(DocumentLoader):
+    def load_content_from_file(self, file_path):
+        # Implement this method for DocTR
+        pass
+
+    def load_content_from_stream(self, stream):
+        # Implement this method for DocTR
+        pass
diff --git a/extract_thinker/document_loader/document_loader.py b/extract_thinker/document_loader/document_loader.py
@@ -0,0 +1,67 @@
+from abc import ABC, abstractmethod
+from PIL import Image
+from io import BytesIO
+import pypdfium2 as pdfium
+import concurrent.futures
+from typing import Any, Dict, List, Union
+
+
+class DocumentLoader(ABC):
+    def __init__(self, content: Any = None):
+        self.content = content
+        self.file_path = None
+
+    @abstractmethod
+    def load_content_from_file(self, file_path: str) -> Union[str, object]:
+        pass
+
+    @abstractmethod
+    def load_content_from_stream(self, stream: BytesIO) -> Union[str, object]:
+        pass
+
+    def getContent(self) -> Any:
+        return self.content
+
+    def convert_pdf_to_images(self, file_path: str, scale: float = 300 / 72) -> List[Dict[int, bytes]]:
+        # Check if the file is already an image
+        try:
+            Image.open(file_path)
+            is_image = True
+        except IOError:
+            is_image = False
+
+        if is_image:
+            # If it is, return it as is
+            with open(file_path, "rb") as f:
+                return [{0: f.read()}]
+
+        # If it's not an image, proceed with the conversion
+        pdf_file = pdfium.PdfDocument(file_path)
+
+        page_indices = [i for i in range(len(pdf_file))]
+
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = []
+            for i in page_indices:
+                future = executor.submit(self.render_page, pdf_file, i, scale)
+                futures.append(future)
+
+            final_images = []
+            for future in concurrent.futures.as_completed(futures):
+                final_images.append(future.result())
+
+        return final_images
+
+    @staticmethod
+    def render_page(pdf_file: pdfium.PdfDocument, page_index: int, scale: float) -> Dict[int, bytes]:
+        renderer = pdf_file.render(
+            pdfium.PdfBitmap.to_pil,
+            page_indices=[page_index],
+            scale=scale,
+        )
+        image_list = list(renderer)
+        image = image_list[0]
+        image_byte_array = BytesIO()
+        image.save(image_byte_array, format="jpeg", optimize=True)
+        image_byte_array = image_byte_array.getvalue()
+        return {page_index: image_byte_array}
diff --git a/extract_thinker/document_loader/document_loader_tesseract.py b/extract_thinker/document_loader/document_loader_tesseract.py
@@ -0,0 +1,49 @@
+from io import BytesIO
+import os
+from typing import Union
+from PIL import Image
+import pytesseract
+
+from extract_thinker.document_loader.document_loader import DocumentLoader
+
+from ..utils import get_image_type
+
+SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]
+
+
+class DocumentLoaderTesseract(DocumentLoader):
+    def __init__(self, tesseract_cmd, isContainer=False, content=None):
+        self.content = content
+        self.tesseract_cmd = tesseract_cmd
+        if isContainer:
+            # docker path to tesseract
+            self.tesseract_cmd = os.environ.get("TESSERACT_PATH", "tesseract")
+        pytesseract.pytesseract.tesseract_cmd = self.tesseract_cmd
+        if not os.path.isfile(self.tesseract_cmd):
+            raise Exception(f"Tesseract not found at {self.tesseract_cmd}")
+
+    def load_content_from_file(self, file_path: str) -> Union[str, object]:
+        try:
+            file_type = get_image_type(file_path)
+            if file_type in SUPPORTED_IMAGE_FORMATS:
+                image = Image.open(file_path)
+                raw_text = str(pytesseract.image_to_string(image))
+                self.content = raw_text
+                return self.content
+            else:
+                raise Exception(f"Unsupported file type: {file_path}")
+        except Exception as e:
+            raise Exception(f"Error processing file: {e}") from e
+
+    def load_content_from_stream(self, stream: Union[BytesIO, str]) -> Union[str, object]:
+        try:
+            file_type = get_image_type(stream)
+            if file_type in SUPPORTED_IMAGE_FORMATS:
+                image = Image.open(stream)
+                raw_text = str(pytesseract.image_to_string(image))
+                self.content = raw_text
+                return self.content
+            else:
+                raise Exception(f"Unsupported stream type: {stream}")
+        except Exception as e:
+            raise Exception(f"Error processing stream: {e}") from e
diff --git a/extract_thinker/document_loader/llm_interceptor.py b/extract_thinker/document_loader/llm_interceptor.py
@@ -0,0 +1,7 @@
+from abc import ABC, abstractmethod
+
+
+class LlmInterceptor(ABC):
+    @abstractmethod
+    def process(self, messages: list, response: str) -> None:
+        pass
diff --git a/extract_thinker/document_loader/loader_interceptor.py b/extract_thinker/document_loader/loader_interceptor.py
@@ -0,0 +1,7 @@
+from abc import ABC, abstractmethod
+
+
+class LoaderInterceptor(ABC):
+    @abstractmethod
+    def process(self, file: str, content: str) -> None:
+        raise NotImplementedError
diff --git a/extract_thinker/document_loader/text_extract_loader.py b/extract_thinker/document_loader/text_extract_loader.py
@@ -0,0 +1,11 @@
+from extract_thinker.document_loader.document_loader import DocumentLoader
+
+
+class TextExtractLoader(DocumentLoader):
+    def load_content_from_file(self, file_path):
+        # Implement this method for TextExtract
+        pass
+
+    def load_content_from_stream(self, stream):
+        # Implement this method for TextExtract
+        pass