Merge pull request #212 from enoch3712/211-new-feature-extract-from-s…

…tring Document Loader for already processed content
enoch3712 · Jan 21, 2025 · 7b2c051 · 7b2c051
2 parents 485545c + c43c5d3
commit 7b2c051
Showing 3 changed files with 332 additions and 0 deletions.
diff --git a/docs/core-concepts/document-loaders/data.md b/docs/core-concepts/document-loaders/data.md
@@ -0,0 +1,92 @@
+# Data Document Loader
+
+The Data loader is a specialized loader that handles pre-processed data in a standardized format. It provides caching support and vision mode compatibility.
+
+## Supported Format
+
+The loader expects data in the following standard format:
+```python
+[
+  {
+    "content": "...some text...",
+    "image": None or [] or bytes
+  }
+]
+```
+
+## Usage
+
+### Basic Usage
+
+```python
+from extract_thinker import DocumentLoaderData
+
+# Initialize with default settings
+loader = DocumentLoaderData()
+
+# Load pre-formatted data
+data = [{"content": "Sample text", "image": None}]
+pages = loader.load(data)
+
+# Process content
+for page in pages:
+    # Access text content
+    text = page["content"]
+    # Access image data if present
+    image = page["image"]
+```
+
+### Configuration-based Usage
+
+```python
+from extract_thinker import DocumentLoaderData, DataLoaderConfig
+
+# Create configuration
+config = DataLoaderConfig(
+    content=None,                # Initial content
+    cache_ttl=600,              # Cache results for 10 minutes
+    supports_vision=True         # Enable vision support
+)
+
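+# Initialize loader from the configuration values
+loader = DocumentLoaderData(
+    content=config.content,
+    cache_ttl=config.cache_ttl,
+    supports_vision=config.supports_vision
+)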
+
+# Load and process content
+pages = loader.load("raw text content")
+```
+
+## Configuration Options
+
+The `DataLoaderConfig` class supports the following options:
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `content` | Any | None | Initial content to process |
+| `cache_ttl` | int | 300 | Cache time-to-live in seconds |
+| `supports_vision` | bool | True | Whether vision mode is supported |
+
+## Features
+
+- Handles pre-processed data in standard format
+- Supports raw text input
+- File and stream processing
+- Vision mode support
+- Automatic format validation
+- Caching support
+- Flexible input handling
+
+## Input Types
+
+The loader can handle:
+- Pre-formatted list of dictionaries
+- Raw text strings
+- File paths
+- IO streams
+
+## Notes
+
+- Vision mode is configurable
+- Validates input format automatically
+- Caches results keyed on the input (string, stream value, or list representation) and the current vision mode
+- Supports both file and stream-based loading
+- Preserves image data when in vision mode
diff --git a/extract_thinker/document_loader/document_loader_data.py b/extract_thinker/document_loader/document_loader_data.py
@@ -0,0 +1,144 @@
+from typing import Any, Dict, List, Union, Optional, IO
+from cachetools import cachedmethod
+from cachetools.keys import hashkey
+from operator import attrgetter
+from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
+from dataclasses import dataclass
+
+@dataclass
+class DataLoaderConfig:
+    """Settings bundle for the Data loader.
+
+    Attributes:
+        content: Optional initial content; defaults to None.
+        cache_ttl: Cache time-to-live, in seconds; defaults to 300.
+        supports_vision: Whether the loader supports vision mode;
+            defaults to True.
+    """
+    # Optional initial content.
+    content: Optional[Any] = None
+    # Cache time-to-live in seconds.
+    cache_ttl: int = 300
+    # Whether vision mode is supported.
+    supports_vision: bool = True
+
+class DocumentLoaderData(CachedDocumentLoader):
+    """
+    Document loader that handles pre-processed data with caching support.
+
+    Accepted sources are a pre-formatted list of page dicts, a string (a path
+    to a UTF-8 text file, or raw text), or a readable stream. Every source is
+    normalized to the standard format:
+    [
+      {
+        "content": "...some text...",
+        "image": None or [] or bytes
+      }
+    ]
+    """
+
+    def __init__(self,
+                 content: Optional[Any] = None,
+                 cache_ttl: int = 300,
+                 supports_vision: bool = True):
+        """Initialize loader with optional content and cache settings.
+
+        Args:
+            content: Initial content, or a ready-made ``DataLoaderConfig``.
+                When a config is passed, its ``cache_ttl`` and
+                ``supports_vision`` are used and the keyword values ignored.
+            cache_ttl: Cache time-to-live in seconds.
+            supports_vision: Whether this loader supports vision mode.
+        """
+        if isinstance(content, DataLoaderConfig):
+            # Accept a config object directly instead of storing it as content.
+            self.config = content
+        else:
+            self.config = DataLoaderConfig(
+                content=content,
+                cache_ttl=cache_ttl,
+                supports_vision=supports_vision
+            )
+        super().__init__(self.config.content, self.config.cache_ttl)
+        self._supports_vision = self.config.supports_vision
+
+    def can_handle(self, source: Any) -> bool:
+        """Return True for strings, objects with ``read``, or lists of dicts."""
+        if isinstance(source, str) or hasattr(source, "read"):
+            return True
+        return isinstance(source, list) and all(isinstance(item, dict) for item in source)
+
+    def _cache_key(self, source: Any):
+        """Build the cache key used by ``load``.
+
+        The key carries a tag for the kind of source so that inputs of
+        different kinds with the same textual form (a path string vs. a stream
+        holding that same text, or a list vs. a string equal to its ``str()``)
+        never share a cache entry. The current vision mode is part of the key
+        because it changes the default ``image`` value of the result.
+        """
+        if isinstance(source, str):
+            kind, payload = "str", source
+        elif hasattr(source, "getvalue"):
+            kind, payload = "stream", source.getvalue()
+        else:
+            # NOTE(review): streams without getvalue() are keyed on str(source),
+            # which does not reflect their content -- confirm this is acceptable.
+            kind, payload = type(source).__name__, str(source)
+        return hashkey(kind, payload, self.vision_mode)
+
+    @cachedmethod(cache=attrgetter('cache'), key=_cache_key)
+    def load(self, source: Union[str, IO, List[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        """
+        Load and process content with caching support.
+        Returns a list of pages in standard format.
+
+        Args:
+            source: String, IO stream, or pre-formatted list of dicts
+
+        Returns:
+            List[Dict[str, Any]]: List of pages with content and image
+
+        Raises:
+            ValueError: If the source type is unsupported or processing fails;
+                the underlying exception is chained as the cause.
+        """
+        if not self.can_handle(source):
+            raise ValueError("Can only handle str, readable streams, or list of dicts")
+
+        try:
+            if isinstance(source, list):
+                return self._validate_and_format_list(source)
+            if isinstance(source, str):
+                return self._load_from_string(source)
+            # can_handle() passed, so the remaining case is a readable stream.
+            return self._load_from_stream(source)
+        except Exception as e:
+            raise ValueError(f"Error processing content: {str(e)}") from e
+
+    def _validate_and_format_list(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+        """Validate a list of page dicts and return them in standard format.
+
+        Raises:
+            ValueError: If an item is not a dict or lacks a 'content' key.
+        """
+        formatted_pages = []
+        for item in data:
+            if not isinstance(item, dict):
+                raise ValueError(
+                    "Invalid format. Expected a list of dictionaries in format: "
+                    "[{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
+                    f"Got item of type {type(item).__name__} instead of dict"
+                )
+            if "content" not in item:
+                raise ValueError(
+                    "Invalid format. Each dictionary must have a 'content' field. "
+                    "Expected format: [{'content': 'your text here', 'image': None or [] or bytes}, ...]. "
+                    f"Got keys: {list(item.keys())}"
+                )
+
+            # Preserve the original image value if present, otherwise use vision mode default
+            image_value = item.get("image", [] if self.vision_mode else None)
+            formatted_pages.append({
+                "content": item["content"],
+                "image": image_value
+            })
+        return formatted_pages
+
+    def _load_from_string(self, text: str) -> List[Dict[str, Any]]:
+        """Process string input: read it as a UTF-8 file path, else use it as raw text."""
+        try:
+            # Try to read as file first
+            with open(text, "r", encoding="utf-8") as f:
+                content = f.read()
+        except UnicodeDecodeError:
+            # The path exists but is not UTF-8 text; do not silently fall back.
+            raise
+        except (OSError, ValueError):
+            # Not a readable path; open() raises ValueError for text with an
+            # embedded NUL character. Treat the string as raw text.
+            content = text
+
+        return [{
+            "content": content,
+            "image": [] if self.vision_mode else None
+        }]
+
+    def _load_from_stream(self, stream: IO) -> List[Dict[str, Any]]:
+        """Process stream input, decoding bytes as UTF-8.
+
+        Raises:
+            ValueError: If reading or decoding the stream fails.
+        """
+        try:
+            content = stream.read()
+            if isinstance(content, bytes):
+                content = content.decode('utf-8')
+
+            return [{
+                "content": content,
+                "image": [] if self.vision_mode else None
+            }]
+        except Exception as e:
+            raise ValueError(f"Failed to read from stream: {str(e)}") from e
+
+    def can_handle_vision(self, source: Union[str, IO, List[Dict[str, Any]]]) -> bool:
+        """Check if this loader can handle the source in vision mode."""
+        return self._supports_vision and self.can_handle(source)
diff --git a/tests/test_document_loader_data.py b/tests/test_document_loader_data.py
@@ -0,0 +1,96 @@
+import pytest
+from io import StringIO, BytesIO
+from extract_thinker.document_loader.document_loader_data import DocumentLoaderData, DataLoaderConfig
+
+class TestDocumentLoaderData:
+    """Tests for DocumentLoaderData: input kinds, vision mode, validation, caching."""
+
+    @pytest.fixture
+    def data_config(self):
+        """Default Data configuration for testing"""
+        return DataLoaderConfig(
+            cache_ttl=300,
+            supports_vision=True
+        )
+
+    @pytest.fixture
+    def loader(self, data_config):
+        """Loader built from the default test configuration"""
+        return DocumentLoaderData(
+            cache_ttl=data_config.cache_ttl,
+            supports_vision=data_config.supports_vision
+        )
+
+    @pytest.fixture
+    def test_data(self):
+        """Single pre-formatted page without an image"""
+        return [{
+            "content": "Sample text",
+            "image": None
+        }]
+
+    def test_preformatted_data(self, loader, test_data):
+        """Test handling of pre-formatted data"""
+        pages = loader.load(test_data)
+
+        assert isinstance(pages, list)
+        assert len(pages) == 1
+        assert pages[0]["content"] == "Sample text"
+        assert "image" in pages[0]
+        assert pages[0]["image"] is None
+
+    def test_vision_support(self, loader):
+        """Test vision mode handling"""
+        # Vision mode should be configurable
+        assert loader._supports_vision is True
+        assert loader.can_handle_vision("test") is True
+
+        # Create loader with vision disabled
+        no_vision_loader = DocumentLoaderData(supports_vision=False)
+        assert no_vision_loader._supports_vision is False
+        assert no_vision_loader.can_handle_vision("test") is False
+
+    def test_string_input(self, loader):
+        """Test handling of string input"""
+        text = "Hello world"
+        pages = loader.load(text)
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == text
+        assert pages[0]["image"] is None
+
+    def test_stream_input(self, loader):
+        """Test handling of stream input"""
+        text = "Stream content"
+        stream = StringIO(text)
+        pages = loader.load(stream)
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == text
+        assert pages[0]["image"] is None
+
+    def test_bytes_stream_input(self, loader):
+        """Test that a bytes stream is decoded as UTF-8 text"""
+        text = "Stream content"
+        stream = BytesIO(text.encode("utf-8"))
+        pages = loader.load(stream)
+
+        assert len(pages) == 1
+        assert pages[0]["content"] == text
+        assert pages[0]["image"] is None
+
+    def test_vision_mode_output(self, loader):
+        """Test output format in vision mode"""
+        loader.set_vision_mode(True)
+        pages = loader.load("test")
+        assert pages[0]["image"] == []
+
+        loader.set_vision_mode(False)
+        pages = loader.load("test")
+        assert pages[0]["image"] is None
+
+    def test_invalid_input(self, loader):
+        """Test error handling for invalid inputs"""
+        with pytest.raises(ValueError):
+            loader.load(123)  # Invalid type
+
+        with pytest.raises(ValueError):
+            loader.load([{"wrong_key": "value"}])  # Missing content field
+
+        with pytest.raises(ValueError):
+            loader.load([123])  # Not a dict
+
+    def test_caching(self, loader):
+        """Test that caching works"""
+        test_input = "cache test"
+        result1 = loader.load(test_input)
+        result2 = loader.load(test_input)
+
+        assert result1 == result2
+        assert result1 is result2  # Should be same object (cached)