From 9550db8e64c4d638a429be33c10f10f18871f795 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Wed, 7 Aug 2024 17:16:35 +0200
Subject: [PATCH] docs: improve examples (#27)

Signed-off-by: Michele Dolfi
---
 Dockerfile                                |   1 -
 README.md                                 |   8 +-
 examples/{convert.py => batch_convert.py} |  15 +--
 examples/custom_convert.py                | 125 ++++++++++++++++++++++
 examples/minimal.py                       |  15 ++-
 5 files changed, 139 insertions(+), 25 deletions(-)
 rename examples/{convert.py => batch_convert.py} (75%)
 create mode 100644 examples/custom_convert.py

diff --git a/Dockerfile b/Dockerfile
index b2138a63..3fb81722 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,6 @@ COPY examples/minimal.py /root/minimal.py
 
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
-RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
 
 # On container shell:
 # > cd /root/
diff --git a/README.md b/README.md
index 8c11f087..1acb5c71 100644
--- a/README.md
+++ b/README.md
@@ -56,17 +56,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
 
 ### Convert a batch of documents
 
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 
 ### Adjust pipeline features
 
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
+
+
 #### Control pipeline options
 
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
 
diff --git a/examples/convert.py b/examples/batch_convert.py
similarity index 75%
rename from examples/convert.py
rename to examples/batch_convert.py
index be216406..e54860e0 100644
--- a/examples/convert.py
+++ b/examples/batch_convert.py
@@ -4,9 +4,7 @@
 from pathlib import Path
 from typing import Iterable
 
-# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
@@ -52,16 +50,7 @@ def main():
         Path("./test/data/2305.03393v1.pdf"),
     ]
 
-    artifacts_path = DocumentConverter.download_models_hf()
-
-    pipeline_options = PipelineOptions(do_table_structure=True)
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    doc_converter = DocumentConverter(
-        artifacts_path=artifacts_path,
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
+    doc_converter = DocumentConverter()
 
     input = DocumentConversionInput.from_paths(input_doc_paths)
 
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
new file mode 100644
index 00000000..8aab1f47
--- /dev/null
+++ b/examples/custom_convert.py
@@ -0,0 +1,125 @@
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterable
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def export_documents(
+    converted_docs: Iterable[ConvertedDocument],
+    output_dir: Path,
+):
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    success_count = 0
+    failure_count = 0
+
+    for doc in converted_docs:
+        if doc.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = doc.input.file.stem
+
+            # Export Deep Search document JSON format:
+            with (output_dir / f"{doc_filename}.json").open("w") as fp:
+                fp.write(json.dumps(doc.render_as_dict()))
+
+            # Export Markdown format:
+            with (output_dir / f"{doc_filename}.md").open("w") as fp:
+                fp.write(doc.render_as_markdown())
+        else:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            failure_count += 1
+
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+        Path("./test/data/2203.01017v2.pdf"),
+        Path("./test/data/2305.03393v1.pdf"),
+    ]
+
+    ###########################################################################
+
+    # The following sections contain a combination of PipelineOptions
+    # and PDF Backends for various configurations.
+    # Uncomment one section at a time to see the differences in the output.
+
+    # PyPdfium without OCR
+    # --------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = False
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = False
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # PyPdfium with OCR
+    # -----------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # Docling Parse without OCR
+    # -------------------------
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    # Docling Parse with OCR
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
+    ###########################################################################
+
+    # Define input files
+    input = DocumentConversionInput.from_paths(input_doc_paths)
+
+    start_time = time.time()
+
+    converted_docs = doc_converter.convert(input)
+    export_documents(converted_docs, output_dir=Path("./scratch"))
+
+    end_time = time.time() - start_time
+
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/minimal.py b/examples/minimal.py
index 3f77910b..0ea45a6e 100644
--- a/examples/minimal.py
+++ b/examples/minimal.py
@@ -1,11 +1,8 @@
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
-artifacts_path = DocumentConverter.download_models_hf()
-doc_converter = DocumentConverter(artifacts_path=artifacts_path)
-
-input = DocumentConversionInput.from_paths(["factsheet.pdf"])
-converted_docs = doc_converter.convert(input)
-
-for d in converted_docs:
-    print(d.render_as_dict())
+source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+converter = DocumentConverter()
+doc = converter.convert_single(source)
+print(
+    doc.export_to_markdown()
+)  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
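
The four configurations in `examples/custom_convert.py` above differ in only two switches: whether OCR runs, and which PDF backend parses the file. A minimal sketch of collapsing them into one helper, assuming the same docling APIs the patch uses (`PipelineOptions`, `DocumentConverter`, `DoclingParseDocumentBackend`, `PyPdfiumDocumentBackend`); `make_converter` is a hypothetical name, not part of the patch:

```python
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter


def make_converter(do_ocr: bool = False, use_docling_parse: bool = True) -> DocumentConverter:
    """Hypothetical helper: collapses the four sections of custom_convert.py."""
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Pick the PDF parsing backend; both classes are imported in custom_convert.py.
    backend = DoclingParseDocumentBackend if use_docling_parse else PyPdfiumDocumentBackend
    return DocumentConverter(pipeline_options=pipeline_options, pdf_backend=backend)


# Example: the "Docling Parse with OCR" configuration.
doc_converter = make_converter(do_ocr=True, use_docling_parse=True)
```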