Merge pull request #201 from enoch3712/200-markitdown-refactor

markdown multi page fix
enoch3712 · Jan 17, 2025 · bd47a1c · bd47a1c
2 parents e11306b + 24167a8
commit bd47a1c
Show file tree

Hide file tree

Showing 2 changed files with 88 additions and 59 deletions.
diff --git a/extract_thinker/document_loader/document_loader_markitdown.py b/extract_thinker/document_loader/document_loader_markitdown.py
@@ -13,8 +13,9 @@
 
 @dataclass
 class MarkItDownConfig:
-    """Configuration for MarkItDown document loader.
-    
+    """
+    Configuration for MarkItDown document loader.
+
     Args:
         content: Initial content (optional)
         cache_ttl: Cache time-to-live in seconds (default: 300)
@@ -25,7 +26,6 @@ class MarkItDownConfig:
         page_separator: Character used to separate pages (default: form feed '\\f')
         preserve_whitespace: Whether to preserve whitespace in text (default: False)
     """
-    # Optional parameters
     content: Optional[Any] = None
     cache_ttl: int = 300
     llm_client: Optional[Any] = None
@@ -51,8 +51,11 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader):
     """
     Document loader that uses MarkItDown to extract content from various file formats.
     Supports text extraction and optional image/page rendering in vision mode.
+    Produces a list of pages, each with:
+      - "content": text from that page
+      - "image": optional page/image bytes if vision_mode is True
     """
-    
+
     SUPPORTED_FORMATS = [
         "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", 
         "csv", "tsv", "txt", "html", "xml", "json", "zip",
@@ -70,26 +73,25 @@ def __init__(
         page_separator: str = '\f',
         preserve_whitespace: bool = False
     ):
-        """Initialize loader.
-        
+        """
+        Initialize the loader.
+
         Args:
-            content_or_config: Either a MarkItDownConfig object or initial content
+            content_or_config: Either a MarkItDownConfig object or the initial content
             cache_ttl: Cache time-to-live in seconds (only used if content_or_config is not MarkItDownConfig)
-            llm_client: LLM client for enhanced text processing (only used if content_or_config is not MarkItDownConfig)
-            llm_model: LLM model name to use (only used if content_or_config is not MarkItDownConfig)
-            mime_type_detection: Whether to use magic for MIME type detection (only used if content_or_config is not MarkItDownConfig)
-            default_extension: Default file extension when type cannot be determined (only used if content_or_config is not MarkItDownConfig)
-            page_separator: Character used to separate pages (only used if content_or_config is not MarkItDownConfig)
-            preserve_whitespace: Whether to preserve whitespace in text (only used if content_or_config is not MarkItDownConfig)
+            llm_client: LLM client (only used if content_or_config is not MarkItDownConfig)
+            llm_model: LLM model name (only used if content_or_config is not MarkItDownConfig)
+            mime_type_detection: Whether to use magic for MIME type detection
+            default_extension: Default extension if MIME type detection fails
+            page_separator: Character used to separate pages
+            preserve_whitespace: Whether to preserve whitespace
         """
-        # Check dependencies before initializing
         self._check_dependencies()
-        
-        # Handle both config-based and old-style initialization
+
+        # Handle config object vs. old-style params
         if isinstance(content_or_config, MarkItDownConfig):
             self.config = content_or_config
         else:
-            # Create config from individual parameters
             self.config = MarkItDownConfig(
                 content=content_or_config,
                 cache_ttl=cache_ttl,
@@ -100,8 +102,10 @@ def __init__(
                 page_separator=page_separator,
                 preserve_whitespace=preserve_whitespace
             )
-        
+
         super().__init__(self.config.content, self.config.cache_ttl)
+
+        # MarkItDown object
         self.markitdown = self._get_markitdown()(
             llm_client=self.config.llm_client,
             llm_model=self.config.llm_model
@@ -114,82 +118,87 @@ def _check_dependencies():
             import markitdown
         except ImportError:
             raise ImportError(
-                "Could not import markitdown package. "
+                "Could not import the 'markitdown' package. "
                 "Please install it with `pip install markitdown`."
             )
 
     def _get_markitdown(self):
-        """Lazy load MarkItDown."""
-        try:
-            from markitdown import MarkItDown
-            return MarkItDown
-        except ImportError:
-            raise ImportError(
-                "Could not import markitdown python package. "
-                "Please install it with `pip install markitdown`."
-            )
+        """Lazy-import MarkItDown class."""
+        from markitdown import MarkItDown
+        return MarkItDown
 
     def _process_text(self, text: str) -> str:
-        """Process text according to configuration."""
-        if not self.config.preserve_whitespace:
-            text = text.strip()
-        return text
+        """Apply any additional text processing (e.g., strip whitespace)."""
+        return text if self.config.preserve_whitespace else text.strip()
 
     @cachedmethod(cache=attrgetter('cache'), 
-                  key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
+                  key=lambda self, source: hashkey(
+                      source if isinstance(source, str) else source.getvalue(), 
+                      self.vision_mode
+                  ))
     def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
         """
-        Load and process content using MarkItDown.
-        Returns a list of pages, each containing:
-        - content: The text content
-        - image: The page/image bytes if vision_mode is True
-        
+        Load and process the source with MarkItDown, returning a list of pages.
+
         Args:
-            source: Either a file path or BytesIO stream
-            
+            source: A file path or a BytesIO stream
+
         Returns:
-            List[Dict[str, Any]]: List of pages with content and optional images
+            A list of dictionaries where each dict is one "page" of text.
+            - "content": The text content (str)
+            - "image": Optional bytes; the key is only added when vision_mode is True and an image exists for that page index
         """
         if not self.can_handle(source):
             raise ValueError(f"Cannot handle source: {source}")
 
+        # Basic check for vision mode
         if self.vision_mode and not self.can_handle_vision(source):
             raise ValueError(f"Cannot handle source in vision mode: {source}")
 
         try:
-            # Extract text content using MarkItDown
+            # Convert the file or stream with MarkItDown
             if isinstance(source, str):
+                # File path
                 result = self.markitdown.convert(source)
             else:
-                # For BytesIO, we need to determine the file type
+                # BytesIO
                 source.seek(0)
                 if self.config.mime_type_detection:
                     mime = magic.from_buffer(source.getvalue(), mime=True)
-                    ext = next((ext for ext, mime_types in MIME_TYPE_MAPPING.items() 
-                              if mime in (mime_types if isinstance(mime_types, list) else [mime_types])), 
-                             self.config.default_extension)
+                    # Attempt to deduce extension from MIME type
+                    ext = next(
+                        (
+                            e
+                            for e, mime_list in MIME_TYPE_MAPPING.items()
+                            if mime in (mime_list if isinstance(mime_list, list) else [mime_list])
+                        ),
+                        self.config.default_extension
+                    )
                 else:
                     ext = self.config.default_extension
                 result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
                 source.seek(0)
 
+            # Full text from MarkItDown
             text_content = result.text_content
+            if not text_content:
+                text_content = ""
 
-            # Split into pages if supported
-            pages = []
-            if self.can_handle_paginate(source):
-                raw_pages = text_content.split(self.config.page_separator)
-                for page_text in raw_pages:
-                    processed_text = self._process_text(page_text)
-                    if processed_text or self.config.preserve_whitespace:
-                        pages.append({"content": processed_text})
-            else:
-                processed_text = self._process_text(text_content)
-                pages = [{"content": processed_text}]
+            # Split text content into pages (based on config.page_separator)
+            raw_pages = text_content.split(self.config.page_separator)
 
-            # Add images in vision mode
+            pages = []
+            for page_text in raw_pages:
+                processed = self._process_text(page_text)
+                # Always include the page if preserve_whitespace is True, 
+                # or if there's any non-empty text.
+                if processed or self.config.preserve_whitespace:
+                    pages.append({"content": processed})
+
+            # In vision mode, attach rendered images if applicable
             if self.vision_mode:
                 images_dict = self.convert_to_images(source)
+                # Match up page images by index
                 for idx, page_dict in enumerate(pages):
                     if idx in images_dict:
                         page_dict["image"] = images_dict[idx]

diff --git a/tests/test_document_loader_markitdown.py b/tests/test_document_loader_markitdown.py
@@ -141,4 +141,24 @@ def test_simple_initialization(self, test_file_path):
             pages = loader.load(stream)
             assert len(pages) > 0
             assert isinstance(pages[0]["content"], str)
-
+
+
+def test_page_separator_splitting():
+    """
+    Test that multiple pages are correctly separated when loading a multi-page PDF.
+    Uses Regional_GDP_per_capita_2018_2.pdf, which is expected to yield 2 pages.
+    """
+    # Get path to the multi-page PDF test file
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    bulk_pdf_path = os.path.join(current_dir, 'files', 'Regional_GDP_per_capita_2018_2.pdf')
+
+    # Test without MIME type detection
+    config = MarkItDownConfig(
+        mime_type_detection=False,
+        default_extension='pdf'
+    )
+    loader = DocumentLoaderMarkItDown(config)
+    pages = loader.load(bulk_pdf_path)
+
+    # Verify we get exactly 2 pages
+    assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"