Skip to content

Commit

Permalink
Merge pull request #212 from enoch3712/211-new-feature-extract-from-s…
Browse files Browse the repository at this point in the history
…tring

Document Loader for already processed content
enoch3712 authored Jan 21, 2025

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature. The key has expired.
2 parents 485545c + c43c5d3 commit 7b2c051
Showing 3 changed files with 332 additions and 0 deletions.
92 changes: 92 additions & 0 deletions docs/core-concepts/document-loaders/data.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Data Document Loader

The Data loader is a specialized loader that handles pre-processed data in a standardized format. It provides caching support and vision mode compatibility.

## Supported Format

The loader expects data in the following standard format:
```python
[
{
"content": "...some text...",
"image": None or [] or bytes
}
]
```

## Usage

### Basic Usage

```python
from extract_thinker import DocumentLoaderData

# Initialize with default settings
loader = DocumentLoaderData()

# Load pre-formatted data
data = [{"content": "Sample text", "image": None}]
pages = loader.load(data)

# Process content
for page in pages:
# Access text content
text = page["content"]
# Access image data if present
image = page["image"]
```

### Configuration-based Usage

```python
from extract_thinker import DocumentLoaderData, DataLoaderConfig

# Create configuration
config = DataLoaderConfig(
content=None, # Initial content
cache_ttl=600, # Cache results for 10 minutes
supports_vision=True # Enable vision support
)

# Initialize loader from the configuration values
loader = DocumentLoaderData(
    content=config.content,
    cache_ttl=config.cache_ttl,
    supports_vision=config.supports_vision
)

# Load and process content
pages = loader.load("raw text content")
```

## Configuration Options

The `DataLoaderConfig` class supports the following options:

| Option | Type | Default | Description |
|--------|------|---------|-------------|
| `content` | Any | None | Initial content to process |
| `cache_ttl` | int | 300 | Cache time-to-live in seconds |
| `supports_vision` | bool | True | Whether vision mode is supported |

## Features

- Handles pre-processed data in standard format
- Supports raw text input
- File and stream processing
- Vision mode support
- Automatic format validation
- Caching support
- Flexible input handling

## Input Types

The loader can handle:
- Pre-formatted list of dictionaries
- Raw text strings
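- File paths (a string is first tried as a path to a UTF-8 text file; if it cannot be opened, it is treated as raw text)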
- IO streams

## Notes

- Vision mode is configurable
- Validates input format automatically
- Caches results keyed on the input content and the current vision mode
- Supports both file and stream-based loading
- Preserves any `image` value supplied in pre-formatted data; pages without one get `[]` in vision mode and `None` otherwise
144 changes: 144 additions & 0 deletions extract_thinker/document_loader/document_loader_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from typing import Any, Dict, List, Union, Optional, IO
from cachetools import cachedmethod
from cachetools.keys import hashkey
from operator import attrgetter
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from dataclasses import dataclass

@dataclass
class DataLoaderConfig:
    """Settings for the Data document loader.

    Attributes:
        content: Optional initial content; ``None`` when not supplied.
        cache_ttl: Cache time-to-live, in seconds (300 unless overridden).
        supports_vision: Whether the loader supports vision mode
            (``True`` unless overridden).
    """

    content: Optional[Any] = None
    cache_ttl: int = 300
    supports_vision: bool = True

class DocumentLoaderData(CachedDocumentLoader):
    """
    Document loader that handles pre-processed data with caching support.

    Expects data in standard format:
        [
            {
                "content": "...some text...",
                "image": None or [] or bytes
            }
        ]

    Besides pre-formatted lists, plain strings (tried as a file path first,
    then used as raw text) and readable streams are accepted; each of those
    produces a single page.
    """

    def __init__(self,
                 content: Optional[Any] = None,
                 cache_ttl: int = 300,
                 supports_vision: bool = True):
        """Initialize loader with optional content and cache settings.

        Args:
            content: Initial content, or a ready-made ``DataLoaderConfig``.
                When a config object is given, its values are used and the
                ``cache_ttl`` / ``supports_vision`` arguments are ignored.
            cache_ttl: Cache time-to-live in seconds.
            supports_vision: Whether this loader supports vision mode.
        """
        if isinstance(content, DataLoaderConfig):
            # Allow DocumentLoaderData(config) instead of storing the config
            # object itself as the content.
            self.config = content
        else:
            self.config = DataLoaderConfig(
                content=content,
                cache_ttl=cache_ttl,
                supports_vision=supports_vision
            )
        super().__init__(self.config.content, self.config.cache_ttl)
        self._supports_vision = self.config.supports_vision

    def can_handle(self, source: Any) -> bool:
        """Check if we can handle this source type.

        Returns True for strings, for objects exposing a ``read`` attribute,
        and for lists whose items are all dicts (an empty list qualifies).
        """
        if isinstance(source, str):
            return True
        if hasattr(source, "read"):
            return True
        if isinstance(source, list) and all(isinstance(item, dict) for item in source):
            return True
        return False

    # Cache key: the string itself, a stream's getvalue() when available,
    # otherwise str(source); combined with the current vision mode so the
    # two modes never share an entry.
    @cachedmethod(cache=attrgetter('cache'),
                  key=lambda self, source: hashkey(
                      source if isinstance(source, str)
                      else source.getvalue() if hasattr(source, 'getvalue')
                      else str(source),
                      self.vision_mode))
    def load(self, source: Union[str, IO, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
        """
        Load and process content with caching support.

        Args:
            source: String, IO stream, or pre-formatted list of dicts

        Returns:
            List[Dict[str, Any]]: List of pages with content and image.
            Results are cached, so a repeated call with the same input may
            return the very same list object; treat it as read-only.

        Raises:
            ValueError: If the source type is unsupported, or if processing
                fails for any reason (the original error is chained).
        """
        if not self.can_handle(source):
            raise ValueError("Can only handle str, readable streams, or list of dicts")

        try:
            if isinstance(source, list):
                return self._validate_and_format_list(source)
            elif isinstance(source, str):
                return self._load_from_string(source)
            elif hasattr(source, "read"):
                return self._load_from_stream(source)

        except Exception as e:
            # Chain explicitly so the underlying failure stays visible.
            raise ValueError(f"Error processing content: {str(e)}") from e

    def _validate_and_format_list(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and format a list of dictionaries.

        Every item must be a dict with a 'content' key. Only 'content' and
        'image' are carried over into the returned pages.

        Raises:
            ValueError: If an item is not a dict or lacks 'content'.
        """
        formatted_pages = []
        for item in data:
            if not isinstance(item, dict):
                raise ValueError(
                    "Invalid format. Expected a list of dictionaries in format: "
                    "[{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
                    f"Got item of type {type(item).__name__} instead of dict"
                )
            if "content" not in item:
                raise ValueError(
                    "Invalid format. Each dictionary must have a 'content' field. "
                    "Expected format: [{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
                    f"Got keys: {list(item.keys())}"
                )

            # Preserve the original image value if present, otherwise use vision mode default
            image_value = item.get("image", [] if self.vision_mode else None)
            page = {
                "content": item["content"],
                "image": image_value
            }
            formatted_pages.append(page)
        return formatted_pages

    def _load_from_string(self, text: str) -> List[Dict[str, Any]]:
        """Process string input.

        The string is first tried as a path to a UTF-8 text file; if it
        cannot be opened as one, the string itself is used as the content.

        NOTE(review): because any string that names a readable file is read
        from disk, callers passing untrusted text should be aware that it
        can cause local files to be loaded.
        """
        try:
            # Try to read as file first
            f = open(text, "r", encoding="utf-8")
        except (OSError, ValueError):
            # Not openable as a path: missing file, directory, name too long,
            # or text that open() rejects outright (e.g. an embedded NUL
            # byte raises ValueError). Treat it as raw text.
            content = text
        else:
            with f:
                try:
                    content = f.read()
                except OSError:
                    # Opened but unreadable: fall back to the raw text.
                    content = text

        return [{
            "content": content,
            "image": [] if self.vision_mode else None
        }]

    def _load_from_stream(self, stream: IO) -> List[Dict[str, Any]]:
        """Process stream input.

        Reads the whole stream; bytes are decoded as UTF-8.

        Raises:
            ValueError: If reading or decoding fails (original error chained).
        """
        try:
            content = stream.read()
            if isinstance(content, bytes):
                content = content.decode('utf-8')

            return [{
                "content": content,
                "image": [] if self.vision_mode else None
            }]
        except Exception as e:
            raise ValueError(f"Failed to read from stream: {str(e)}") from e

    def can_handle_vision(self, source: Union[str, IO]) -> bool:
        """Check if this loader can handle the source in vision mode.

        True only when vision support is enabled for this loader and the
        source itself is of a supported type.
        """
        return self._supports_vision and self.can_handle(source)
96 changes: 96 additions & 0 deletions tests/test_document_loader_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import pytest
from io import StringIO, BytesIO
from extract_thinker.document_loader.document_loader_data import DocumentLoaderData, DataLoaderConfig

class TestDocumentLoaderData:
    """Tests for DocumentLoaderData: input types, vision mode, validation, caching."""

    @pytest.fixture
    def data_config(self):
        """Default Data configuration for testing"""
        return DataLoaderConfig(
            cache_ttl=300,
            supports_vision=True
        )

    @pytest.fixture
    def loader(self, data_config):
        """Loader built from the default configuration values"""
        return DocumentLoaderData(
            cache_ttl=data_config.cache_ttl,
            supports_vision=data_config.supports_vision
        )

    @pytest.fixture
    def test_data(self):
        """A single pre-formatted page without an image"""
        return [{
            "content": "Sample text",
            "image": None
        }]

    def test_preformatted_data(self, loader, test_data):
        """Test handling of pre-formatted data"""
        pages = loader.load(test_data)

        assert isinstance(pages, list)
        assert len(pages) == 1
        assert pages[0]["content"] == "Sample text"
        assert "image" in pages[0]
        assert pages[0]["image"] is None

    def test_vision_support(self, loader):
        """Test vision mode handling"""
        # Vision mode should be configurable
        assert loader._supports_vision is True
        assert loader.can_handle_vision("test") is True

        # Create loader with vision disabled
        no_vision_loader = DocumentLoaderData(supports_vision=False)
        assert no_vision_loader._supports_vision is False
        assert no_vision_loader.can_handle_vision("test") is False

    def test_string_input(self, loader):
        """Test handling of string input"""
        text = "Hello world"
        pages = loader.load(text)

        assert len(pages) == 1
        assert pages[0]["content"] == text
        assert pages[0]["image"] is None

    def test_file_path_input(self, loader, tmp_path):
        """Test that a string naming an existing file loads the file's text"""
        path = tmp_path / "sample.txt"
        path.write_text("File content", encoding="utf-8")
        pages = loader.load(str(path))

        assert len(pages) == 1
        assert pages[0]["content"] == "File content"

    def test_stream_input(self, loader):
        """Test handling of stream input"""
        text = "Stream content"
        stream = StringIO(text)
        pages = loader.load(stream)

        assert len(pages) == 1
        assert pages[0]["content"] == text
        assert pages[0]["image"] is None

    def test_bytes_stream_input(self, loader):
        """Test that byte streams are decoded as UTF-8"""
        text = "Bytes content"
        stream = BytesIO(text.encode("utf-8"))
        pages = loader.load(stream)

        assert len(pages) == 1
        assert pages[0]["content"] == text
        assert pages[0]["image"] is None

    def test_vision_mode_output(self, loader):
        """Test output format in vision mode"""
        loader.set_vision_mode(True)
        pages = loader.load("test")
        assert pages[0]["image"] == []

        loader.set_vision_mode(False)
        pages = loader.load("test")
        assert pages[0]["image"] is None

    def test_invalid_input(self, loader):
        """Test error handling for invalid inputs"""
        with pytest.raises(ValueError):
            loader.load(123)  # Invalid type

        with pytest.raises(ValueError):
            loader.load([{"wrong_key": "value"}])  # Missing content field

        with pytest.raises(ValueError):
            loader.load([123])  # Not a dict

    def test_caching(self, loader):
        """Test that caching works"""
        test_input = "cache test"
        result1 = loader.load(test_input)
        result2 = loader.load(test_input)

        assert result1 == result2
        assert result1 is result2  # Should be same object (cached)

0 comments on commit 7b2c051

Please sign in to comment.