Skip to content

Commit

Permalink
Merge pull request #201 from enoch3712/200-markitdown-refactor
Browse files Browse the repository at this point in the history
markdown multi page fix
  • Loading branch information
enoch3712 authored Jan 17, 2025
2 parents e11306b + 24167a8 commit bd47a1c
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 59 deletions.
125 changes: 67 additions & 58 deletions extract_thinker/document_loader/document_loader_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,9 @@

@dataclass
class MarkItDownConfig:
"""Configuration for MarkItDown document loader.
"""
Configuration for MarkItDown document loader.
Args:
content: Initial content (optional)
cache_ttl: Cache time-to-live in seconds (default: 300)
Expand All @@ -25,7 +26,6 @@ class MarkItDownConfig:
page_separator: Character used to separate pages (default: form feed '\\f')
preserve_whitespace: Whether to preserve whitespace in text (default: False)
"""
# Optional parameters
content: Optional[Any] = None
cache_ttl: int = 300
llm_client: Optional[Any] = None
Expand All @@ -51,8 +51,11 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader):
"""
Document loader that uses MarkItDown to extract content from various file formats.
Supports text extraction and optional image/page rendering in vision mode.
Produces a list of pages, each with:
- "content": text from that page
- "image": optional page/image bytes if vision_mode is True
"""

SUPPORTED_FORMATS = [
"pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx",
"csv", "tsv", "txt", "html", "xml", "json", "zip",
Expand All @@ -70,26 +73,25 @@ def __init__(
page_separator: str = '\f',
preserve_whitespace: bool = False
):
"""Initialize loader.
"""
Initialize the loader.
Args:
content_or_config: Either a MarkItDownConfig object or initial content
content_or_config: Either a MarkItDownConfig object or the initial content
cache_ttl: Cache time-to-live in seconds (only used if content_or_config is not MarkItDownConfig)
llm_client: LLM client for enhanced text processing (only used if content_or_config is not MarkItDownConfig)
llm_model: LLM model name to use (only used if content_or_config is not MarkItDownConfig)
mime_type_detection: Whether to use magic for MIME type detection (only used if content_or_config is not MarkItDownConfig)
default_extension: Default file extension when type cannot be determined (only used if content_or_config is not MarkItDownConfig)
page_separator: Character used to separate pages (only used if content_or_config is not MarkItDownConfig)
preserve_whitespace: Whether to preserve whitespace in text (only used if content_or_config is not MarkItDownConfig)
llm_client: LLM client (only used if content_or_config is not MarkItDownConfig)
llm_model: LLM model name (only used if content_or_config is not MarkItDownConfig)
mime_type_detection: Whether to use magic for MIME type detection
default_extension: Default extension if MIME type detection fails
page_separator: Character used to separate pages
preserve_whitespace: Whether to preserve whitespace
"""
# Check dependencies before initializing
self._check_dependencies()
# Handle both config-based and old-style initialization

# Handle config object vs. old-style params
if isinstance(content_or_config, MarkItDownConfig):
self.config = content_or_config
else:
# Create config from individual parameters
self.config = MarkItDownConfig(
content=content_or_config,
cache_ttl=cache_ttl,
Expand All @@ -100,8 +102,10 @@ def __init__(
page_separator=page_separator,
preserve_whitespace=preserve_whitespace
)

super().__init__(self.config.content, self.config.cache_ttl)

# MarkItDown object
self.markitdown = self._get_markitdown()(
llm_client=self.config.llm_client,
llm_model=self.config.llm_model
Expand All @@ -114,82 +118,87 @@ def _check_dependencies():
import markitdown
except ImportError:
raise ImportError(
"Could not import markitdown package. "
"Could not import the 'markitdown' package. "
"Please install it with `pip install markitdown`."
)

def _get_markitdown(self):
"""Lazy load MarkItDown."""
try:
from markitdown import MarkItDown
return MarkItDown
except ImportError:
raise ImportError(
"Could not import markitdown python package. "
"Please install it with `pip install markitdown`."
)
"""Lazy-import MarkItDown class."""
from markitdown import MarkItDown
return MarkItDown

def _process_text(self, text: str) -> str:
"""Process text according to configuration."""
if not self.config.preserve_whitespace:
text = text.strip()
return text
"""Apply any additional text processing (e.g., strip whitespace)."""
return text if self.config.preserve_whitespace else text.strip()

@cachedmethod(cache=attrgetter('cache'),
key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
key=lambda self, source: hashkey(
source if isinstance(source, str) else source.getvalue(),
self.vision_mode
))
def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
"""
Load and process content using MarkItDown.
Returns a list of pages, each containing:
- content: The text content
- image: The page/image bytes if vision_mode is True
Load and process the source with MarkItDown, returning a list of pages.
Args:
source: Either a file path or BytesIO stream
source: A file path or a BytesIO stream
Returns:
List[Dict[str, Any]]: List of pages with content and optional images
A list of dictionaries where each dict is one "page" of text.
- "content": The text content (str)
- "image": Optional bytes if vision mode is enabled (key only present if vision_mode is True)
"""
if not self.can_handle(source):
raise ValueError(f"Cannot handle source: {source}")

# Basic check for vision mode
if self.vision_mode and not self.can_handle_vision(source):
raise ValueError(f"Cannot handle source in vision mode: {source}")

try:
# Extract text content using MarkItDown
# Convert the file or stream with MarkItDown
if isinstance(source, str):
# File path
result = self.markitdown.convert(source)
else:
# For BytesIO, we need to determine the file type
# BytesIO
source.seek(0)
if self.config.mime_type_detection:
mime = magic.from_buffer(source.getvalue(), mime=True)
ext = next((ext for ext, mime_types in MIME_TYPE_MAPPING.items()
if mime in (mime_types if isinstance(mime_types, list) else [mime_types])),
self.config.default_extension)
# Attempt to deduce extension from MIME type
ext = next(
(
e
for e, mime_list in MIME_TYPE_MAPPING.items()
if mime in (mime_list if isinstance(mime_list, list) else [mime_list])
),
self.config.default_extension
)
else:
ext = self.config.default_extension
result = self.markitdown.convert_stream(source, file_extension=f".{ext}")
source.seek(0)

# Full text from MarkItDown
text_content = result.text_content
if not text_content:
text_content = ""

# Split into pages if supported
pages = []
if self.can_handle_paginate(source):
raw_pages = text_content.split(self.config.page_separator)
for page_text in raw_pages:
processed_text = self._process_text(page_text)
if processed_text or self.config.preserve_whitespace:
pages.append({"content": processed_text})
else:
processed_text = self._process_text(text_content)
pages = [{"content": processed_text}]
# Split text content into pages (based on config.page_separator)
raw_pages = text_content.split(self.config.page_separator)

# Add images in vision mode
pages = []
for page_text in raw_pages:
processed = self._process_text(page_text)
# Always include the page if preserve_whitespace is True,
# or if there's any non-empty text.
if processed or self.config.preserve_whitespace:
pages.append({"content": processed})

# In vision mode, attach rendered images if applicable
if self.vision_mode:
images_dict = self.convert_to_images(source)
# Match up page images by index
for idx, page_dict in enumerate(pages):
if idx in images_dict:
page_dict["image"] = images_dict[idx]
Expand Down
22 changes: 21 additions & 1 deletion tests/test_document_loader_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,24 @@ def test_simple_initialization(self, test_file_path):
pages = loader.load(stream)
assert len(pages) > 0
assert isinstance(pages[0]["content"], str)



def test_page_separator_splitting():
"""
Test that multiple pages are correctly separated when loading a multi-page PDF.
Uses bulk.pdf which should contain 3 distinct pages.
"""
# Get path to bulk.pdf test file
current_dir = os.path.dirname(os.path.abspath(__file__))
bulk_pdf_path = os.path.join(current_dir, 'files', 'Regional_GDP_per_capita_2018_2.pdf')

# Test without MIME type detection
config = MarkItDownConfig(
mime_type_detection=False,
default_extension='pdf'
)
loader = DocumentLoaderMarkItDown(config)
pages = loader.load(bulk_pdf_path)

# Verify we get exactly 3 pages
assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

0 comments on commit bd47a1c

Please sign in to comment.