Skip to content

Commit

Permalink
Merge pull request #236 from enoch3712/231-add-url-support-to-markitd…
Browse files Browse the repository at this point in the history
…own-and-docling-dl

url added for docling and markitdown DL
  • Loading branch information
enoch3712 authored Feb 3, 2025
2 parents fb431a3 + 3c44613 commit b9e4090
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 26 deletions.
31 changes: 31 additions & 0 deletions extract_thinker/document_loader/document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,37 @@ def _init_docling_converter(self):
from docling.document_converter import DocumentConverter
return DocumentConverter()

def _is_url(self, potential_url: str) -> bool:
"""
Check if the given string is a URL.
Returns:
True if the string starts with "http://" or "https://", otherwise False.
"""
return potential_url.startswith("http://") or potential_url.startswith("https://")

def can_handle(self, source: Union[str, BytesIO]) -> bool:
    """
    Report whether this loader accepts the given source.

    Supports URLs, local file paths with supported extensions, and
    in-memory BytesIO streams.

    Args:
        source: The document source — a string (file path or URL) or a
            BytesIO stream.

    Returns:
        True if the source is a valid input for the loader, else False.
    """
    # In-memory streams are always accepted.
    if isinstance(source, BytesIO):
        return True
    if not isinstance(source, str):
        return False
    # URLs are accepted outright.
    if self._is_url(source):
        return True
    # Otherwise treat the string as a path and check its extension.
    suffix = source.rsplit('.', 1)[-1].lower()
    return suffix in self.SUPPORTED_FORMATS

@cachedmethod(cache=lambda self: self.cache,
key=lambda self, source: hashkey(
source if isinstance(source, str) else source.getvalue(),
Expand Down
34 changes: 32 additions & 2 deletions extract_thinker/document_loader/document_loader_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,15 @@ def _process_text(self, text: str) -> str:
"""Apply any additional text processing (e.g., strip whitespace)."""
return text if self.config.preserve_whitespace else text.strip()

def _is_url(self, source: str) -> bool:
"""Check if the source is a URL."""
try:
from urllib.parse import urlparse
result = urlparse(source)
return all([result.scheme, result.netloc])
except:
return False

@cachedmethod(cache=attrgetter('cache'),
key=lambda self, source: hashkey(
source if isinstance(source, str) else source.getvalue(),
Expand All @@ -141,7 +150,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
Load and process the source with MarkItDown, returning a list of pages.
Args:
source: A file path or a BytesIO stream
source: A file path, BytesIO stream, or URL
Returns:
A list of dictionaries where each dict is one "page" of text.
Expand Down Expand Up @@ -206,4 +215,25 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]:
return pages

except Exception as e:
raise ValueError(f"Error processing document with MarkItDown: {str(e)}")
raise ValueError(f"Error processing document with MarkItDown: {str(e)}")

def can_handle(self, source: Union[str, BytesIO]) -> bool:
    """
    Report whether the loader can handle the given source.

    Args:
        source: Either a file path (str), a URL (str), or a BytesIO stream.

    Returns:
        bool: True if the loader can handle the source, False otherwise
        (including when probing the source raises).
    """
    try:
        # Streams are delegated to the stream probe.
        if isinstance(source, BytesIO):
            return self._can_handle_stream(source)
        if isinstance(source, str):
            # URLs are accepted directly; anything else is treated as a path.
            return True if self._is_url(source) else self._can_handle_file_path(source)
    except Exception:
        # Capability probe: any failure simply means "cannot handle".
        return False
    return False
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "extract_thinker"
version = "0.1.4"
version = "0.1.5"
description = "Library to extract data from files and documents agnostically using LLMs"
authors = ["Júlio Almeida <[email protected]>"]
readme = "README.md"
Expand All @@ -9,7 +9,7 @@ readme = "README.md"
python = ">=3.9,<3.14"
pydantic = "^2.10.4"
litellm = "^1.57.0"
pillow = "^11.1.0"
pillow = ">=10.4,<12.0"
pypdfium2 = "^4.29.0"
instructor = "^1.7.2"
python-dotenv = "^1.0.1"
Expand Down
17 changes: 15 additions & 2 deletions tests/test_document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,20 @@ def test_title_extraction(self):

# 4. Check that your known Title text is present
# Suppose your PDF has "Document Title" as the Title.
assert "1 Introduction" in page_text, (
assert "## 1 ntroduction" in page_text, (
"Expected the recognized Title ('1 Introduction') "
"to appear in the extracted text."
)
)

def test_url_loading(self, loader):
    """Docling loader should accept a URL and load it into non-empty pages."""
    url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter"
    # can_handle must recognize the URL before we attempt the network load.
    assert loader.can_handle(url) is True

    pages = loader.load(url)
    assert isinstance(pages, list)
    assert len(pages) > 0
    # Every page dict must expose its text under the "content" key.
    for entry in pages:
        assert "content" in entry
        assert isinstance(entry["content"], str)
55 changes: 36 additions & 19 deletions tests/test_document_loader_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,23 +142,40 @@ def test_simple_initialization(self, test_file_path):
assert len(pages) > 0
assert isinstance(pages[0]["content"], str)

def test_page_separator_splitting(self):
    """
    Test that multiple pages are correctly separated when loading a multi-page PDF.
    Uses Regional_GDP_per_capita_2018_2.pdf, which is expected to split into 2 pages.
    """
    # Build the path to the multi-page PDF fixture stored next to this test module.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    bulk_pdf_path = os.path.join(current_dir, 'files', 'Regional_GDP_per_capita_2018_2.pdf')

    # Disable MIME sniffing so the configured default 'pdf' extension is used.
    config = MarkItDownConfig(
        mime_type_detection=False,
        default_extension='pdf'
    )
    loader = DocumentLoaderMarkItDown(config)
    pages = loader.load(bulk_pdf_path)

    # The fixture PDF contains 2 pages, so the separator split must yield exactly 2.
    assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"

def test_url_loading(self, loader):
    """MarkItDown loader should accept a URL and load it into non-empty pages."""
    url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter"
    # The loader must report the URL as a valid source before loading it.
    assert loader.can_handle(url) is True

    pages = loader.load(url)
    assert isinstance(pages, list)
    assert len(pages) > 0
    # Each resulting page dict must carry its text under the "content" key.
    for entry in pages:
        assert "content" in entry
        assert isinstance(entry["content"], str)

def test_page_separator_splitting():
    """
    Test that multiple pages are correctly separated when loading a multi-page PDF.
    Uses Regional_GDP_per_capita_2018_2.pdf, which is expected to split into 2 pages.
    """
    # Build the path to the multi-page PDF fixture stored next to this test module.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    bulk_pdf_path = os.path.join(current_dir, 'files', 'Regional_GDP_per_capita_2018_2.pdf')

    # Disable MIME sniffing so the configured default 'pdf' extension is used.
    config = MarkItDownConfig(
        mime_type_detection=False,
        default_extension='pdf'
    )
    loader = DocumentLoaderMarkItDown(config)
    pages = loader.load(bulk_pdf_path)

    # The fixture PDF contains 2 pages, so the separator split must yield exactly 2.
    assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}"
def test_can_handle_url(self, loader):
    """MarkItDown loader should report that it can handle a URL source."""
    sample_url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter"
    assert loader.can_handle(sample_url) is True

0 comments on commit b9e4090

Please sign in to comment.