Merge pull request #218 from enoch3712/208-question-docling-text

Add extra Markdown components (titles, section headers, code blocks, list items) to the Docling document loader
enoch3712 · Jan 22, 2025 · 7e2a481 · 7e2a481
2 parents e2cc594 + eedf1c4
commit 7e2a481
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 5 deletions.
diff --git a/extract_thinker/document_loader/document_loader_docling.py b/extract_thinker/document_loader/document_loader_docling.py
@@ -254,15 +254,32 @@ def _extract_page_text(self, page: Any) -> str:
         Gather text from a docling Page object. 
         Handles both text and table items.
         """
-
         from docling_core.types.doc import DocItemLabel, TableItem
 
         lines = []
         if page.assembled and page.assembled.elements:
             for element in page.assembled.elements:
-                # Normal text
-                if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
+                # Titles
+                if element.label == DocItemLabel.TITLE:
+                    lines.append(f"# {element.text or ''}")
+
+                # Section headers
+                elif element.label == DocItemLabel.SECTION_HEADER:
+                    lines.append(f"## {element.text or ''}")
+
+                # Code blocks
+                elif element.label == DocItemLabel.CODE:
+                    code_text = element.text or ""
+                    lines.append(f"```\n{code_text}\n```")
+
+                # List items
+                elif element.label == DocItemLabel.LIST_ITEM:
+                    lines.append(f"- {element.text or ''}")
+
+                # Normal text and paragraphs
+                elif element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]:
                     lines.append(element.text or "")
+
                 # Tables
                 elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem):
                     table_text = self.convert_table_to_text(element)
@@ -296,5 +313,4 @@ def convert_table_to_text(self, table_item: Any) -> str:
             else:
                 rows.append("| " + " | ".join(row_text) + " |")
 
-        return "\n".join(rows)
-
+        return "\n".join(rows)
diff --git a/tests/files/fca-approach-payment-services-electronic-money-2017-5.pdf b/tests/files/fca-approach-payment-services-electronic-money-2017-5.pdf
diff --git a/tests/test_document_loader_docling.py b/tests/test_document_loader_docling.py
@@ -204,3 +204,30 @@ def test_custom_ocr_config(self, test_file_path):
         assert isinstance(pages, list)
         assert len(pages) > 0
         assert "content" in pages[0]
+
+    def test_title_extraction(self):
+        """
+        Test that a PDF with a recognized Title actually shows that Title
+        in the extracted text or markdown.
+        """
+
+        loader = DocumentLoaderDocling()
+
+        # 1. Provide the path to your custom test file with a known Title
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        test_pdf_path = os.path.join(current_dir, 'files', 'fca-approach-payment-services-electronic-money-2017-5.pdf')
+
+        # 2. Load it
+        pages = loader.load(test_pdf_path)
+        assert pages, "No pages were returned from the PDF."
+
+        # 3. Inspect the text from the first page (or all pages)
+        page_text = pages[0]["content"]  # or loop over pages if you prefer
+        assert isinstance(page_text, str), "Expected 'content' to be a string."
+
+        # 4. Check that the known heading text is present.
+        #    The test PDF is expected to contain "1 Introduction" on its first page.
+        assert "1 Introduction" in page_text, (
+            "Expected the recognized Title ('1 Introduction') "
+            "to appear in the extracted text."
+        )