diff --git a/extract_thinker/document_loader/document_loader_docling.py b/extract_thinker/document_loader/document_loader_docling.py index 6d01e9b..e2a3212 100644 --- a/extract_thinker/document_loader/document_loader_docling.py +++ b/extract_thinker/document_loader/document_loader_docling.py @@ -254,15 +254,32 @@ def _extract_page_text(self, page: Any) -> str: Gather text from a docling Page object. Handles both text and table items. """ - from docling_core.types.doc import DocItemLabel, TableItem lines = [] if page.assembled and page.assembled.elements: for element in page.assembled.elements: - # Normal text - if element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]: + # Titles + if element.label == DocItemLabel.TITLE: + lines.append(f"# {element.text or ''}") + + # Section headers + elif element.label == DocItemLabel.SECTION_HEADER: + lines.append(f"## {element.text or ''}") + + # Code blocks + elif element.label == DocItemLabel.CODE: + code_text = element.text or "" + lines.append(f"```\n{code_text}\n```") + + # List items + elif element.label == DocItemLabel.LIST_ITEM: + lines.append(f"- {element.text or ''}") + + # Normal text and paragraphs + elif element.label in [DocItemLabel.TEXT, DocItemLabel.PARAGRAPH]: lines.append(element.text or "") + # Tables elif element.label == DocItemLabel.TABLE and isinstance(element, TableItem): table_text = self.convert_table_to_text(element) @@ -296,5 +313,4 @@ def convert_table_to_text(self, table_item: Any) -> str: else: rows.append("| " + " | ".join(row_text) + " |") - return "\n".join(rows) - + return "\n".join(rows) \ No newline at end of file diff --git a/tests/files/fca-approach-payment-services-electronic-money-2017-5.pdf b/tests/files/fca-approach-payment-services-electronic-money-2017-5.pdf new file mode 100644 index 0000000..95f5003 Binary files /dev/null and b/tests/files/fca-approach-payment-services-electronic-money-2017-5.pdf differ diff --git a/tests/test_document_loader_docling.py 
b/tests/test_document_loader_docling.py index 2f3b367..a34dc5d 100644 --- a/tests/test_document_loader_docling.py +++ b/tests/test_document_loader_docling.py @@ -204,3 +204,30 @@ def test_custom_ocr_config(self, test_file_path): assert isinstance(pages, list) assert len(pages) > 0 assert "content" in pages[0] + + def test_title_extraction(self): + """ + Test that a PDF with a recognized Title actually shows that Title + in the extracted text or markdown. + """ + + loader = DocumentLoaderDocling() + + # 1. Provide the path to your custom test file with a known Title + current_dir = os.path.dirname(os.path.abspath(__file__)) + test_pdf_path = os.path.join(current_dir, 'files', 'fca-approach-payment-services-electronic-money-2017-5.pdf') + + # 2. Load it + pages = loader.load(test_pdf_path) + assert pages, "No pages were returned from the PDF." + + # 3. Inspect the text from the first page (or all pages) + page_text = pages[0]["content"] # or loop over pages if you prefer + assert isinstance(page_text, str), "Expected 'content' to be a string." + + # 4. Check that your known Title text is present + # The fixture PDF is expected to contain the heading "1 Introduction" on its first page. + assert "1 Introduction" in page_text, ( + "Expected the recognized Title ('1 Introduction') " + "to appear in the extracted text." + ) \ No newline at end of file