From 8cc147bc56753144915709a48b08830d0c3ad44e Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 26 Aug 2024 13:30:26 +0200 Subject: [PATCH] fix: align output formats (#49) Signed-off-by: Michele Dolfi --- docling/document_converter.py | 7 +++---- examples/minimal.py | 6 ++---- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/docling/document_converter.py b/docling/document_converter.py index 8a71a570..e637f18c 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -88,7 +88,7 @@ def convert(self, input: DocumentConversionInput) -> Iterable[ConvertedDocument] # Note: Pdfium backend is not thread-safe, thread pool usage was disabled. yield from map(self.process_document, input_batch) - def convert_single(self, source: Path | AnyHttpUrl | str) -> Document: + def convert_single(self, source: Path | AnyHttpUrl | str) -> ConvertedDocument: """Convert a single document. Args: @@ -133,11 +133,10 @@ def convert_single(self, source: Path | AnyHttpUrl | str) -> Document: converted_doc: ConvertedDocument = next(converted_docs_iter) if converted_doc.status not in { ConversionStatus.SUCCESS, - ConversionStatus.SUCCESS_WITH_ERRORS, + ConversionStatus.PARTIAL_SUCCESS, }: raise RuntimeError(f"Conversion failed with status: {converted_doc.status}") - doc = converted_doc.to_ds_document() - return doc + return converted_doc def process_document(self, in_doc: InputDocument) -> ConvertedDocument: start_doc_time = time.time() diff --git a/examples/minimal.py b/examples/minimal.py index 0ea45a6e..837db718 100644 --- a/examples/minimal.py +++ b/examples/minimal.py @@ -1,8 +1,6 @@ from docling.document_converter import DocumentConverter -source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL +source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL converter = DocumentConverter() doc = converter.convert_single(source) -print( - doc.export_to_markdown() -) # output: "## DocLayNet: A Large 
Human-Annotated Dataset for Document-Layout Analysis [...]" +print(doc.render_as_markdown()) # output: "## Docling Technical Report [...]"