Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MD extra components added to docling DL #218

Merged
merged 1 commit into from
Jan 22, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions extract_thinker/document_loader/document_loader_docling.py
Original file line number Diff line number Diff line change
@@ -254,15 +254,32 @@ def _extract_page_text(self, page: Any) -> str:
Gather text from a docling Page object.
Handles both text and table items.
"""

from docling_core.types.doc import DocItemLabel, TableItem

lines = []
if page.assembled and page.assembled.elements:
for element in page.assembled.elements:
# Normal text
if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
# Titles
if element.label == DocItemLabel.TITLE:
lines.append(f"# {element.text or ''}")

# Section headers
elif element.label == DocItemLabel.SECTION_HEADER:
lines.append(f"## {element.text or ''}")

# Code blocks
elif element.label == DocItemLabel.CODE:
code_text = element.text or ""
lines.append(f"```\n{code_text}\n```")

# List items
elif element.label == DocItemLabel.LIST_ITEM:
lines.append(f"- {element.text or ''}")

# Normal text and paragraphs
elif element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
lines.append(element.text or "")

# Tables
elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem):
table_text = self.convert_table_to_text(element)
@@ -296,5 +313,4 @@ def convert_table_to_text(self, table_item: Any) -> str:
else:
rows.append("| " + " | ".join(row_text) + " |")

return "\n".join(rows)

return "\n".join(rows)
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_document_loader_docling.py
Original file line number Diff line number Diff line change
@@ -204,3 +204,30 @@ def test_custom_ocr_config(self, test_file_path):
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

def test_title_extraction(self):
    """
    Verify that a known heading survives extraction from a sample PDF.

    Loads the sample PDF from the ``files`` directory next to this test
    module and asserts that the string "1 Introduction" is present in the
    ``content`` of the first returned page.
    """
    loader = DocumentLoaderDocling()

    # Resolve the fixture relative to this module rather than the working
    # directory, so the test behaves the same wherever it is invoked from.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    sample_pdf = os.path.join(
        tests_dir,
        'files',
        'fca-approach-payment-services-electronic-money-2017-5.pdf',
    )

    loaded_pages = loader.load(sample_pdf)
    assert loaded_pages, "No pages were returned from the PDF."

    # Only the first page is inspected.
    first_page_content = loaded_pages[0]["content"]
    assert isinstance(first_page_content, str), "Expected 'content' to be a string."

    # The heading this fixture is expected to expose on its first page.
    expected_heading = "1 Introduction"
    assert expected_heading in first_page_content, (
        "Expected the recognized Title ('1 Introduction') "
        "to appear in the extracted text."
    )