Fix stream type detection; also fix per-page images in the Google Document AI loader

enoch3712 · Dec 23, 2024 · b5180c1 · b5180c1
1 parent 968f8a4
commit b5180c1
Show file tree

Hide file tree

Showing 5 changed files with 173 additions and 10 deletions.
diff --git a/extract_thinker/document_loader/document_loader.py b/extract_thinker/document_loader/document_loader.py
@@ -6,7 +6,8 @@
 from typing import Any, Dict, Union
 from cachetools import TTLCache
 import os
-from extract_thinker.utils import get_file_extension
+import magic
+from extract_thinker.utils import get_file_extension, check_mime_type
 
 class DocumentLoader(ABC):
     def __init__(self, content: Any = None, cache_ttl: int = 300):
@@ -48,11 +49,10 @@ def _can_handle_file_path(self, file_path: str) -> bool:
     def _can_handle_stream(self, stream: BytesIO) -> bool:
         """Checks if the loader can handle the given BytesIO stream."""
         try:
-            stream.seek(0)
-            img = Image.open(stream)
-            file_type = img.format.lower()
-            stream.seek(0)
-            return file_type.lower() in [fmt.lower() for fmt in self.SUPPORTED_FORMATS]
+            # Sniff the MIME type from the entire buffer: getvalue() returns all bytes regardless of position
+            mime = magic.from_buffer(stream.getvalue(), mime=True)
+            stream.seek(0)  # Reset stream position
+            return check_mime_type(mime, self.SUPPORTED_FORMATS)
         except Exception:
             return False
 

diff --git a/extract_thinker/document_loader/document_loader_google_document_ai.py b/extract_thinker/document_loader/document_loader_google_document_ai.py
@@ -129,8 +129,10 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
 
             # Add image data if in vision mode
             if self.vision_mode and self.can_handle_vision(source):
-                for page_data in pages:
-                    page_data["image"] = document_content  # For PDF, each page would need its own image data
+                images_dict = self.convert_to_images(source)
+                for idx, page_data in enumerate(pages):
+                    if idx in images_dict:
+                        page_data["image"] = images_dict[idx]
 
             return pages
 

diff --git a/extract_thinker/utils.py b/extract_thinker/utils.py
@@ -323,4 +323,153 @@ def add_classification_structure(response_model: type[BaseModel], indent: int =
                 content += f"{tab}Nested structure for {name} values:\n"
                 content += add_classification_structure(value_type, indent + 1)
 
-    return content
+    return content
+
+MIME_TYPE_MAPPING = {  # file extension -> accepted MIME type (str) or list of accepted MIME types
+    # Documents
+    'pdf': 'application/pdf',
+    'doc': 'application/msword',
+    'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+    'rtf': ['application/rtf', 'text/rtf'],  # text/rtf is commonly reported by libmagic
+    'txt': 'text/plain',
+    'odt': 'application/vnd.oasis.opendocument.text',
+    'tex': 'application/x-tex',
+    'markdown': ['text/markdown', 'text/x-markdown'],
+    'md': ['text/markdown', 'text/x-markdown'],
+
+    # Spreadsheets
+    'xls': 'application/vnd.ms-excel',
+    'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+    'ods': 'application/vnd.oasis.opendocument.spreadsheet',
+    'csv': ['text/csv', 'application/csv'],
+    'tsv': 'text/tab-separated-values',
+
+    # Presentations
+    'ppt': 'application/vnd.ms-powerpoint',
+    'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
+    'odp': 'application/vnd.oasis.opendocument.presentation',
+    'key': 'application/vnd.apple.keynote',
+
+    # Images
+    'jpg': ['image/jpeg', 'image/jpg'],
+    'jpeg': ['image/jpeg', 'image/jpg'],
+    'png': 'image/png',
+    'gif': 'image/gif',
+    'bmp': ['image/bmp', 'image/x-ms-bmp'],  # older libmagic databases report image/x-ms-bmp
+    'tiff': 'image/tiff',
+    'tif': 'image/tiff',
+    'webp': 'image/webp',
+    'svg': ['image/svg+xml', 'application/svg+xml'],
+    'ico': ['image/x-icon', 'image/vnd.microsoft.icon'],  # the latter is the IANA-registered name
+    'raw': 'image/x-raw',
+    'heic': 'image/heic',
+    'heif': 'image/heif',
+
+    # Web
+    'html': ['text/html', 'application/xhtml+xml'],
+    'htm': ['text/html', 'application/xhtml+xml'],
+    'xhtml': 'application/xhtml+xml',
+    'xml': ['application/xml', 'text/xml'],
+    'json': 'application/json',
+    'yaml': ['application/yaml', 'text/yaml'],
+    'yml': ['application/yaml', 'text/yaml'],
+
+    # Archives
+    'zip': 'application/zip',
+    'rar': ['application/x-rar-compressed', 'application/x-rar', 'application/vnd.rar'],
+    '7z': 'application/x-7z-compressed',
+    'tar': 'application/x-tar',
+    'gz': ['application/gzip', 'application/x-gzip'],  # spelling varies by libmagic version
+
+    # Audio
+    'mp3': 'audio/mpeg',
+    'wav': ['audio/wav', 'audio/x-wav'],  # audio/x-wav is commonly reported by libmagic
+    'ogg': 'audio/ogg',
+    'flac': 'audio/flac',
+    'm4a': 'audio/mp4',
+    'aac': 'audio/aac',
+
+    # Video
+    'mp4': 'video/mp4',
+    'avi': 'video/x-msvideo',
+    'mkv': 'video/x-matroska',
+    'mov': 'video/quicktime',
+    'wmv': 'video/x-ms-wmv',
+    'flv': 'video/x-flv',
+    'webm': 'video/webm',
+
+    # Ebooks
+    'epub': 'application/epub+zip',
+    'mobi': 'application/x-mobipocket-ebook',
+    'azw': 'application/vnd.amazon.ebook',
+    'azw3': 'application/vnd.amazon.ebook',
+
+    # CAD and 3D
+    'dwg': 'application/acad',
+    'dxf': 'application/dxf',
+    'stl': 'model/stl',
+    'obj': 'model/obj',
+
+    # Fonts
+    'ttf': 'font/ttf',
+    'otf': 'font/otf',
+    'woff': 'font/woff',
+    'woff2': 'font/woff2',
+
+    # Programming
+    'py': ['text/x-python', 'text/x-script.python'],  # newer libmagic uses text/x-script.python
+    'js': 'text/javascript',
+    'css': 'text/css',
+    'java': 'text/x-java-source',
+    'cpp': 'text/x-c++src',
+    'c': 'text/x-c',
+    'swift': 'text/x-swift',
+    'go': 'text/x-go',
+    'rs': 'text/x-rust',
+
+    # Database
+    'sql': 'application/sql',
+    'db': 'application/x-sqlite3',
+    'sqlite': 'application/x-sqlite3',
+
+    # Email
+    'eml': 'message/rfc822',
+    'msg': 'application/vnd.ms-outlook',
+
+    # Scientific/Technical
+    'nb': 'application/mathematica',
+    'mat': 'application/x-matlab-data',
+    'r': 'text/x-r',
+    # NOTE: 'tex' is mapped once, under Documents; dict keys must stay unique.
+
+    # Configuration
+    'ini': 'text/plain',
+    'conf': 'text/plain',
+    'toml': 'application/toml',
+
+    # Vector Graphics
+    'ai': 'application/postscript',
+    'eps': 'application/postscript',
+    'ps': 'application/postscript',
+}
+
+def check_mime_type(mime: str, supported_formats: List[str]) -> bool:
+    """
+    Check if a MIME type matches any of the supported formats.
+
+    Args:
+        mime: The MIME type to check; case and ";charset=..." parameters are ignored
+        supported_formats: Extensions ("pdf" or ".PDF") looked up in MIME_TYPE_MAPPING
+
+    Returns:
+        bool: True if the MIME type matches any supported format, else False
+    """
+    # Compare on the bare, lower-cased type so "Image/PNG; charset=binary" matches.
+    mime = (mime or "").split(";", 1)[0].strip().lower()
+    for fmt in supported_formats:
+        expected_mime = MIME_TYPE_MAPPING.get(fmt.lower().lstrip("."), ())
+        if isinstance(expected_mime, str):
+            expected_mime = (expected_mime,)  # tuple membership, not substring search
+        if mime in expected_mime:
+            return True
+    return False
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,6 +26,7 @@ boto3 = "^1.34.161"
 pdfplumber = "^0.11.4"
 pypdf = "^5.1.0"
 docx2txt = "^0.8"
+python-magic = "^0.4.27"
 
 [tool.poetry.dev-dependencies]
 flake8 = "^3.9.2"