Skip to content

Commit

Permalink
Merge pull request #218 from enoch3712/208-question-docling-text
Browse files Browse the repository at this point in the history
MD extra components added to docling DL
  • Loading branch information
enoch3712 authored Jan 22, 2025
2 parents e2cc594 + eedf1c4 commit 7e2a481
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 5 deletions.
26 changes: 21 additions & 5 deletions extract_thinker/document_loader/document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,15 +254,32 @@ def _extract_page_text(self, page: Any) -> str:
Gather text from a docling Page object.
Handles both text and table items.
"""

from docling_core.types.doc import DocItemLabel, TableItem

lines = []
if page.assembled and page.assembled.elements:
for element in page.assembled.elements:
# Normal text
if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
# Titles
if element.label == DocItemLabel.TITLE:
lines.append(f"# {element.text or ''}")

# Section headers
elif element.label == DocItemLabel.SECTION_HEADER:
lines.append(f"## {element.text or ''}")

# Code blocks
elif element.label == DocItemLabel.CODE:
code_text = element.text or ""
lines.append(f"```\n{code_text}\n```")

# List items
elif element.label == DocItemLabel.LIST_ITEM:
lines.append(f"- {element.text or ''}")

# Normal text and paragraphs
elif element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
lines.append(element.text or "")

# Tables
elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem):
table_text = self.convert_table_to_text(element)
Expand Down Expand Up @@ -296,5 +313,4 @@ def convert_table_to_text(self, table_item: Any) -> str:
else:
rows.append("| " + " | ".join(row_text) + " |")

return "\n".join(rows)

return "\n".join(rows)
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_document_loader_docling.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,30 @@ def test_custom_ocr_config(self, test_file_path):
assert isinstance(pages, list)
assert len(pages) > 0
assert "content" in pages[0]

def test_title_extraction(self):
    """Check that a known heading survives extraction of the sample PDF.

    Loads ``fca-approach-payment-services-electronic-money-2017-5.pdf``
    from the ``files`` directory next to this test module and asserts
    that the text ``1 Introduction`` is present in the ``content`` string
    of the first returned page.
    """
    docling_loader = DocumentLoaderDocling()

    # Resolve the fixture relative to this module rather than the
    # current working directory.
    fixtures_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'files'
    )
    pdf_path = os.path.join(
        fixtures_dir,
        'fca-approach-payment-services-electronic-money-2017-5.pdf',
    )

    loaded_pages = docling_loader.load(pdf_path)
    assert loaded_pages, "No pages were returned from the PDF."

    # Only the first page is inspected; later pages are not checked.
    first_page_content = loaded_pages[0]["content"]
    assert isinstance(first_page_content, str), "Expected 'content' to be a string."

    expected_heading = "1 Introduction"
    missing_heading_msg = (
        "Expected the recognized Title ('1 Introduction') "
        "to appear in the extracted text."
    )
    assert expected_heading in first_page_content, missing_heading_msg

0 comments on commit 7e2a481

Please sign in to comment.