-
Notifications
You must be signed in to change notification settings - Fork 106
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Loading status checks…
Merge pull request #212 from enoch3712/211-new-feature-extract-from-s…
…tring Document Loader for already processed content
Showing
3 changed files
with
332 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Data Document Loader | ||
|
||
The Data loader is a specialized loader that handles pre-processed data in a standardized format. It provides caching support and vision mode compatibility. | ||
|
||
## Supported Format | ||
|
||
The loader expects data in the following standard format: | ||
```python | ||
[ | ||
{ | ||
"content": "...some text...", | ||
"image": None or [] or bytes | ||
} | ||
] | ||
``` | ||
|
||
## Usage | ||
|
||
### Basic Usage | ||
|
||
```python | ||
from extract_thinker import DocumentLoaderData | ||
|
||
# Initialize with default settings | ||
loader = DocumentLoaderData() | ||
|
||
# Load pre-formatted data | ||
data = [{"content": "Sample text", "image": None}] | ||
pages = loader.load(data) | ||
|
||
# Process content | ||
for page in pages: | ||
# Access text content | ||
text = page["content"] | ||
# Access image data if present | ||
image = page["image"] | ||
``` | ||
|
||
### Configuration-based Usage | ||
|
||
```python | ||
from extract_thinker import DocumentLoaderData, DataLoaderConfig | ||
|
||
# Create configuration | ||
config = DataLoaderConfig( | ||
content=None, # Initial content | ||
cache_ttl=600, # Cache results for 10 minutes | ||
supports_vision=True # Enable vision support | ||
) | ||
|
||
# Initialize loader with configuration | ||
loader = DocumentLoaderData(config) | ||
|
||
# Load and process content | ||
pages = loader.load("raw text content") | ||
``` | ||
|
||
## Configuration Options | ||
|
||
The `DataLoaderConfig` class supports the following options: | ||
|
||
| Option | Type | Default | Description | | ||
|--------|------|---------|-------------| | ||
| `content` | Any | None | Initial content to process | | ||
| `cache_ttl` | int | 300 | Cache time-to-live in seconds | | ||
| `supports_vision` | bool | True | Whether vision mode is supported | | ||
|
||
## Features | ||
|
||
- Handles pre-processed data in standard format | ||
- Supports raw text input | ||
- File and stream processing | ||
- Vision mode support | ||
- Automatic format validation | ||
- Caching support | ||
- Flexible input handling | ||
|
||
## Input Types | ||
|
||
The loader can handle: | ||
- Pre-formatted list of dictionaries | ||
- Raw text strings | ||
- File paths | ||
- IO streams | ||
|
||
## Notes | ||
|
||
- Vision mode is configurable | ||
- Validates input format automatically | ||
- Caches results based on content hash | ||
- Supports both file and stream-based loading | ||
- Preserves image data when in vision mode |
144 changes: 144 additions & 0 deletions
144
extract_thinker/document_loader/document_loader_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
from typing import Any, Dict, List, Union, Optional, IO | ||
from cachetools import cachedmethod | ||
from cachetools.keys import hashkey | ||
from operator import attrgetter | ||
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader | ||
from dataclasses import dataclass | ||
|
||
@dataclass
class DataLoaderConfig:
    """Configuration for the Data loader.

    Args:
        content: Initial content (optional).
        cache_ttl: Cache time-to-live in seconds (default: 300).
        supports_vision: Whether this loader supports vision mode
            (default: True).
    """

    # Field order is part of the positional constructor signature; keep it.
    content: Optional[Any] = None
    cache_ttl: int = 300
    supports_vision: bool = True
|
||
class DocumentLoaderData(CachedDocumentLoader):
    """
    Document loader that handles pre-processed data with caching support.

    Expects data in standard format:
    [
        {
            "content": "...some text...",
            "image": None or [] or bytes
        }
    ]
    """

    def __init__(self,
                 content: Optional[Any] = None,
                 cache_ttl: int = 300,
                 supports_vision: bool = True):
        """Initialize loader with optional content and cache settings.

        Args:
            content: Initial content, or a ready-made ``DataLoaderConfig``.
                When a config instance is passed it is used as-is and the
                ``cache_ttl`` / ``supports_vision`` arguments are ignored.
            cache_ttl: Cache time-to-live in seconds.
            supports_vision: Whether this loader reports vision support.
        """
        if isinstance(content, DataLoaderConfig):
            # Allows DocumentLoaderData(config); previously the config object
            # itself was silently stored as the content.
            self.config = content
        else:
            self.config = DataLoaderConfig(
                content=content,
                cache_ttl=cache_ttl,
                supports_vision=supports_vision
            )
        super().__init__(self.config.content, self.config.cache_ttl)
        self._supports_vision = self.config.supports_vision

    def can_handle(self, source: Any) -> bool:
        """Check if we can handle this source type.

        Accepted sources are strings, objects exposing ``read``, and lists
        whose items are all dicts (an empty list qualifies).
        """
        if isinstance(source, str):
            return True
        if hasattr(source, "read"):
            return True
        if isinstance(source, list) and all(isinstance(item, dict) for item in source):
            return True
        return False

    def _load_cache_key(self, source: Any):
        """Build the cache key used by ``load``.

        The key carries a source-kind tag so that a string, a stream holding
        the same text, and a list whose ``str()`` happens to equal a string
        never share a cache entry.
        """
        if isinstance(source, str):
            kind, payload = "str", source
        elif hasattr(source, "getvalue"):
            kind, payload = "stream", source.getvalue()
        else:
            kind, payload = type(source).__name__, str(source)
        return hashkey(kind, payload, self.vision_mode)

    @cachedmethod(cache=attrgetter('cache'), key=_load_cache_key)
    def load(self, source: Union[str, IO, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """
        Load and process content with caching support.

        Returns a list of pages in standard format.

        Args:
            source: String, IO stream, or pre-formatted list of dicts

        Returns:
            List[Dict[str, Any]]: List of pages with content and image

        Raises:
            ValueError: If the source type is unsupported or processing fails.
        """
        if not self.can_handle(source):
            raise ValueError("Can only handle str, readable streams, or list of dicts")

        try:
            if isinstance(source, list):
                return self._validate_and_format_list(source)
            if isinstance(source, str):
                return self._load_from_string(source)
            # can_handle() passed and it is neither list nor str, so the
            # source exposes ``read``.
            return self._load_from_stream(source)
        except Exception as e:
            # Chain explicitly so the original failure stays in the traceback.
            raise ValueError(f"Error processing content: {str(e)}") from e

    def _validate_and_format_list(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and format a list of dictionaries.

        Raises:
            ValueError: If an item is not a dict or lacks a 'content' key.
        """
        formatted_pages = []
        for item in data:
            if not isinstance(item, dict):
                raise ValueError(
                    "Invalid format. Expected a list of dictionaries in format: "
                    "[{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
                    f"Got item of type {type(item).__name__} instead of dict"
                )
            if "content" not in item:
                raise ValueError(
                    "Invalid format. Each dictionary must have a 'content' field. "
                    "Expected format: [{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
                    f"Got keys: {list(item.keys())}"
                )

            # Preserve the original image value if present, otherwise use vision mode default
            image_value = item.get("image", [] if self.vision_mode else None)
            formatted_pages.append({
                "content": item["content"],
                "image": image_value
            })
        return formatted_pages

    def _load_from_string(self, text: str) -> List[Dict[str, Any]]:
        """Process string input.

        The string is first tried as a file path; if it cannot be opened it
        is used as raw text.

        NOTE(review): any string that happens to name a readable file yields
        that file's contents rather than the string itself.
        """
        try:
            # Try to read as file first
            with open(text, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            # The path exists but is not UTF-8 text: surface the error rather
            # than silently falling back to the path string.
            raise
        except (OSError, ValueError):
            # OSError: not an openable file. ValueError: open() rejects
            # strings with an embedded NUL, which can only be raw text.
            content = text

        return [{
            "content": content,
            "image": [] if self.vision_mode else None
        }]

    def _load_from_stream(self, stream: IO) -> List[Dict[str, Any]]:
        """Process stream input.

        Reads the whole stream; bytes are decoded as UTF-8.

        Raises:
            ValueError: If reading or decoding fails.
        """
        try:
            content = stream.read()
            if isinstance(content, bytes):
                content = content.decode('utf-8')

            return [{
                "content": content,
                "image": [] if self.vision_mode else None
            }]
        except Exception as e:
            raise ValueError(f"Failed to read from stream: {str(e)}") from e

    def can_handle_vision(self, source: Union[str, IO]) -> bool:
        """Check if this loader can handle the source in vision mode."""
        return self._supports_vision and self.can_handle(source)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
import pytest | ||
from io import StringIO, BytesIO | ||
from extract_thinker.document_loader.document_loader_data import DocumentLoaderData, DataLoaderConfig | ||
|
||
class TestDocumentLoaderData:
    """Tests covering DocumentLoaderData inputs, vision handling and caching."""

    @pytest.fixture
    def data_config(self):
        """Default Data configuration for testing"""
        return DataLoaderConfig(cache_ttl=300, supports_vision=True)

    @pytest.fixture
    def loader(self, data_config):
        """Loader built from the default test configuration."""
        return DocumentLoaderData(
            cache_ttl=data_config.cache_ttl,
            supports_vision=data_config.supports_vision,
        )

    @pytest.fixture
    def test_data(self):
        """A single pre-formatted page without an image."""
        return [{"content": "Sample text", "image": None}]

    def test_preformatted_data(self, loader, test_data):
        """Test handling of pre-formatted data"""
        pages = loader.load(test_data)

        assert isinstance(pages, list)
        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == "Sample text"
        assert "image" in first_page
        assert first_page["image"] is None

    def test_vision_support(self, loader):
        """Test vision mode handling"""
        # Vision mode should be configurable
        assert loader._supports_vision is True
        assert loader.can_handle_vision("test") is True

        # Create loader with vision disabled
        no_vision_loader = DocumentLoaderData(supports_vision=False)
        assert no_vision_loader._supports_vision is False
        assert no_vision_loader.can_handle_vision("test") is False

    def test_string_input(self, loader):
        """Test handling of string input"""
        text = "Hello world"
        pages = loader.load(text)

        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == text
        assert first_page["image"] is None

    def test_stream_input(self, loader):
        """Test handling of stream input"""
        text = "Stream content"
        pages = loader.load(StringIO(text))

        assert len(pages) == 1
        first_page = pages[0]
        assert first_page["content"] == text
        assert first_page["image"] is None

    def test_vision_mode_output(self, loader):
        """Test output format in vision mode"""
        loader.set_vision_mode(True)
        vision_pages = loader.load("test")
        assert vision_pages[0]["image"] == []

        loader.set_vision_mode(False)
        plain_pages = loader.load("test")
        assert plain_pages[0]["image"] is None

    def test_invalid_input(self, loader):
        """Test error handling for invalid inputs"""
        invalid_sources = (
            123,                        # Invalid type
            [{"wrong_key": "value"}],   # Missing content field
            [123],                      # Not a dict
        )
        for bad_source in invalid_sources:
            with pytest.raises(ValueError):
                loader.load(bad_source)

    def test_caching(self, loader):
        """Test that caching works"""
        test_input = "cache test"
        first = loader.load(test_input)
        second = loader.load(test_input)

        assert first == second
        assert first is second  # Should be same object (cached)