Skip to content

Commit

Permalink
solving the stream issue. Google image issue also added
Browse files Browse the repository at this point in the history
  • Loading branch information
enoch3712 committed Dec 23, 2024
1 parent 968f8a4 commit b5180c1
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 10 deletions.
12 changes: 6 additions & 6 deletions extract_thinker/document_loader/document_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
from typing import Any, Dict, Union
from cachetools import TTLCache
import os
from extract_thinker.utils import get_file_extension
import magic
from extract_thinker.utils import get_file_extension, check_mime_type

class DocumentLoader(ABC):
def __init__(self, content: Any = None, cache_ttl: int = 300):
Expand Down Expand Up @@ -48,11 +49,10 @@ def _can_handle_file_path(self, file_path: str) -> bool:
def _can_handle_stream(self, stream: BytesIO) -> bool:
"""Checks if the loader can handle the given BytesIO stream."""
# NOTE(review): this span is a rendered diff hunk whose +/- markers and
# indentation were stripped. Going by the hunk header (-48,11 +49,10), the
# five statements directly below the `try:` appear to be the REMOVED
# Pillow-based check and the four after them the ADDED libmagic-based
# check -- confirm against the repository before treating this as one body.
try:
# Presumably the pre-commit implementation: open the stream as an image and
# compare the image format name against SUPPORTED_FORMATS (case-insensitive).
stream.seek(0)
img = Image.open(stream)
file_type = img.format.lower()
stream.seek(0)
return file_type.lower() in [fmt.lower() for fmt in self.SUPPORTED_FORMATS]
# Presumably the post-commit implementation: sniff the MIME type from the
# buffer contents. getvalue() returns the entire buffer (not only the first
# few bytes) and does not depend on the current stream position.
mime = magic.from_buffer(stream.getvalue(), mime=True)
stream.seek(0) # Reset stream position
return check_mime_type(mime, self.SUPPORTED_FORMATS)
# Any failure while probing the stream is reported as "cannot handle".
except Exception:
return False

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,10 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:

# Add image data if in vision mode
if self.vision_mode and self.can_handle_vision(source):
for page_data in pages:
page_data["image"] = document_content # For PDF, each page would need its own image data
images_dict = self.convert_to_images(source)
for idx, page_data in enumerate(pages):
if idx in images_dict:
page_data["image"] = images_dict[idx]

return pages

Expand Down
151 changes: 150 additions & 1 deletion extract_thinker/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,4 +323,153 @@ def add_classification_structure(response_model: type[BaseModel], indent: int =
content += f"{tab}Nested structure for {name} values:\n"
content += add_classification_structure(value_type, indent + 1)

return content
return content

# Maps a lowercase file extension (without the leading dot) to the MIME
# type(s) accepted for it. A value is either a single MIME string or a list
# of equivalent MIME strings, so consumers must handle both shapes.
# Each extension must appear exactly once: a repeated key in a dict literal
# silently overwrites the earlier entry.
MIME_TYPE_MAPPING = {
    # Documents
    'pdf': 'application/pdf',
    'doc': 'application/msword',
    'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'rtf': 'application/rtf',
    'txt': 'text/plain',
    'odt': 'application/vnd.oasis.opendocument.text',
    'tex': 'application/x-tex',
    'markdown': ['text/markdown', 'text/x-markdown'],
    'md': ['text/markdown', 'text/x-markdown'],

    # Spreadsheets
    'xls': 'application/vnd.ms-excel',
    'xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'ods': 'application/vnd.oasis.opendocument.spreadsheet',
    'csv': ['text/csv', 'application/csv'],
    'tsv': 'text/tab-separated-values',

    # Presentations
    'ppt': 'application/vnd.ms-powerpoint',
    'pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'odp': 'application/vnd.oasis.opendocument.presentation',
    'key': 'application/vnd.apple.keynote',

    # Images
    'jpg': ['image/jpeg', 'image/jpg'],
    'jpeg': ['image/jpeg', 'image/jpg'],
    'png': 'image/png',
    'gif': 'image/gif',
    'bmp': 'image/bmp',
    'tiff': 'image/tiff',
    'tif': 'image/tiff',
    'webp': 'image/webp',
    'svg': ['image/svg+xml', 'application/svg+xml'],
    'ico': 'image/x-icon',
    'raw': 'image/x-raw',
    'heic': 'image/heic',
    'heif': 'image/heif',

    # Web
    'html': ['text/html', 'application/xhtml+xml'],
    'htm': ['text/html', 'application/xhtml+xml'],
    'xhtml': 'application/xhtml+xml',
    'xml': ['application/xml', 'text/xml'],
    'json': 'application/json',
    'yaml': ['application/yaml', 'text/yaml'],
    'yml': ['application/yaml', 'text/yaml'],

    # Archives
    'zip': 'application/zip',
    'rar': 'application/x-rar-compressed',
    '7z': 'application/x-7z-compressed',
    'tar': 'application/x-tar',
    'gz': 'application/gzip',

    # Audio
    'mp3': 'audio/mpeg',
    'wav': 'audio/wav',
    'ogg': 'audio/ogg',
    'flac': 'audio/flac',
    'm4a': 'audio/mp4',
    'aac': 'audio/aac',

    # Video
    'mp4': 'video/mp4',
    'avi': 'video/x-msvideo',
    'mkv': 'video/x-matroska',
    'mov': 'video/quicktime',
    'wmv': 'video/x-ms-wmv',
    'flv': 'video/x-flv',
    'webm': 'video/webm',

    # Ebooks
    'epub': 'application/epub+zip',
    'mobi': 'application/x-mobipocket-ebook',
    'azw': 'application/vnd.amazon.ebook',
    'azw3': 'application/vnd.amazon.ebook',

    # CAD and 3D
    'dwg': 'application/acad',
    'dxf': 'application/dxf',
    'stl': 'model/stl',
    'obj': 'model/obj',

    # Fonts
    'ttf': 'font/ttf',
    'otf': 'font/otf',
    'woff': 'font/woff',
    'woff2': 'font/woff2',

    # Programming
    'py': 'text/x-python',
    'js': 'text/javascript',
    'css': 'text/css',
    'java': 'text/x-java-source',
    'cpp': 'text/x-c++src',
    'c': 'text/x-c',
    'swift': 'text/x-swift',
    'go': 'text/x-go',
    'rs': 'text/x-rust',

    # Database
    'sql': 'application/sql',
    'db': 'application/x-sqlite3',
    'sqlite': 'application/x-sqlite3',

    # Email
    'eml': 'message/rfc822',
    'msg': 'application/vnd.ms-outlook',

    # Scientific/Technical ('tex' is already listed under Documents)
    'nb': 'application/mathematica',
    'mat': 'application/x-matlab-data',
    'r': 'text/x-r',

    # Configuration
    'ini': 'text/plain',
    'conf': 'text/plain',
    'toml': 'application/toml',

    # Vector Graphics
    'ai': 'application/postscript',
    'eps': 'application/postscript',
    'ps': 'application/postscript',
}

def check_mime_type(mime: str, supported_formats: List[str]) -> bool:
    """
    Check if a MIME type matches any of the supported formats.

    The MIME type is normalised before comparison: any parameters after a
    ``;`` (e.g. ``; charset=utf-8``) are dropped and the media type is
    lower-cased, since media type names are case-insensitive.

    Args:
        mime: The MIME type to check (e.g. ``"application/pdf"``).
        supported_formats: List of supported format extensions; each is
            lower-cased and looked up in ``MIME_TYPE_MAPPING``.

    Returns:
        bool: True if the MIME type matches any supported format, False
        otherwise (including when ``mime`` is empty or ``None``).
    """
    if not mime:
        # Nothing was detected, so nothing can match.
        return False

    media_type = mime.split(";", 1)[0].strip().lower()

    for fmt in supported_formats:
        expected_mime = MIME_TYPE_MAPPING.get(fmt.lower())
        if not expected_mime:
            # Extension has no known MIME type; it can never match.
            continue
        # Mapping values are either one MIME string or a list of aliases.
        candidates = expected_mime if isinstance(expected_mime, list) else [expected_mime]
        if media_type in candidates:
            return True
    return False
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ boto3 = "^1.34.161"
pdfplumber = "^0.11.4"
pypdf = "^5.1.0"
docx2txt = "^0.8"
python-magic = "^0.4.27"

[tool.poetry.dev-dependencies]
flake8 = "^3.9.2"
Expand Down

0 comments on commit b5180c1

Please sign in to comment.