From 9550db8e64c4d638a429be33c10f10f18871f795 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Wed, 7 Aug 2024 17:16:35 +0200
Subject: [PATCH] docs: improve examples (#27)

Signed-off-by: Michele Dolfi
---
 Dockerfile                                |   1 -
 README.md                                 |   8 +-
 examples/{convert.py => batch_convert.py} |  15 +--
 examples/custom_convert.py                | 125 ++++++++++++++++++++++
 examples/minimal.py                       |  15 ++-
 5 files changed, 139 insertions(+), 25 deletions(-)
 rename examples/{convert.py => batch_convert.py} (75%)
 create mode 100644 examples/custom_convert.py

diff --git a/Dockerfile b/Dockerfile
index b2138a63..3fb81722 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,7 +15,6 @@ COPY examples/minimal.py /root/minimal.py
 
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
-RUN wget "https://www.ibm.com/docs/en/SSQRB8/com.ibm.spectrum.si.pdfs/IBM_Storage_Insights_Fact_Sheet.pdf" -O /root/factsheet.pdf
 
 # On container shell:
 # > cd /root/
diff --git a/README.md b/README.md
index 8c11f087..1acb5c71 100644
--- a/README.md
+++ b/README.md
@@ -56,17 +56,21 @@ print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotate
 
 ### Convert a batch of documents
 
-For an example of batch-converting documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
+For an example of batch-converting documents, see [batch_convert.py](https://github.com/DS4SD/docling/blob/main/examples/batch_convert.py).
 From a local repo clone, you can run it with:
 
 ```
-python examples/convert.py
+python examples/batch_convert.py
 ```
 The output of the above command will be written to `./scratch`.
 
 ### Adjust pipeline features
 
+The example file [custom_convert.py](https://github.com/DS4SD/docling/blob/main/examples/custom_convert.py) contains multiple ways
+one can adjust the conversion pipeline and features.
+
+
 #### Control pipeline options
 
 You can control if table structure recognition or OCR should be performed by arguments passed to `DocumentConverter`:
 
diff --git a/examples/convert.py b/examples/batch_convert.py
similarity index 75%
rename from examples/convert.py
rename to examples/batch_convert.py
index be216406..e54860e0 100644
--- a/examples/convert.py
+++ b/examples/batch_convert.py
@@ -4,9 +4,7 @@
 from pathlib import Path
 from typing import Iterable
 
-# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
@@ -52,16 +50,7 @@ def main():
         Path("./test/data/2305.03393v1.pdf"),
     ]
 
-    artifacts_path = DocumentConverter.download_models_hf()
-
-    pipeline_options = PipelineOptions(do_table_structure=True)
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    doc_converter = DocumentConverter(
-        artifacts_path=artifacts_path,
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
+    doc_converter = DocumentConverter()
 
     input = DocumentConversionInput.from_paths(input_doc_paths)
 
diff --git a/examples/custom_convert.py b/examples/custom_convert.py
new file mode 100644
index 00000000..8aab1f47
--- /dev/null
+++ b/examples/custom_convert.py
@@ -0,0 +1,125 @@
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Iterable
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
+from docling.document_converter import DocumentConverter
+
+_log = logging.getLogger(__name__)
+
+
+def export_documents(
+    converted_docs: Iterable[ConvertedDocument],
+    output_dir: Path,
+):
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    success_count = 0
+    failure_count = 0
+
+    for doc in converted_docs:
+        if doc.status == ConversionStatus.SUCCESS:
+            success_count += 1
+            doc_filename = doc.input.file.stem
+
+            # Export Deep Search document JSON format:
+            with (output_dir / f"{doc_filename}.json").open("w") as fp:
+                fp.write(json.dumps(doc.render_as_dict()))
+
+            # Export Markdown format:
+            with (output_dir / f"{doc_filename}.md").open("w") as fp:
+                fp.write(doc.render_as_markdown())
+        else:
+            _log.info(f"Document {doc.input.file} failed to convert.")
+            failure_count += 1
+
+    _log.info(
+        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+    )
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    input_doc_paths = [
+        Path("./test/data/2206.01062.pdf"),
+        Path("./test/data/2203.01017v2.pdf"),
+        Path("./test/data/2305.03393v1.pdf"),
+    ]
+
+    ###########################################################################
+
+    # The following sections contain a combination of PipelineOptions
+    # and PDF Backends for various configurations.
+    # Uncomment one section at a time to see the differences in the output.
+
+    # PyPdfium without OCR
+    # --------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = False
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = False
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # PyPdfium with OCR
+    # -----------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=PyPdfiumDocumentBackend,
+    # )
+
+    # Docling Parse without OCR
+    # -------------------------
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = False
+    pipeline_options.do_table_structure = True
+    pipeline_options.table_structure_options.do_cell_matching = True
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+        pdf_backend=DoclingParseDocumentBackend,
+    )
+
+    # Docling Parse with OCR
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
+    ###########################################################################
+
+    # Define input files
+    input = DocumentConversionInput.from_paths(input_doc_paths)
+
+    start_time = time.time()
+
+    converted_docs = doc_converter.convert(input)
+    export_documents(converted_docs, output_dir=Path("./scratch"))
+
+    end_time = time.time() - start_time
+
+    _log.info(f"All documents were converted in {end_time:.2f} seconds.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/minimal.py b/examples/minimal.py
index 3f77910b..0ea45a6e 100644
--- a/examples/minimal.py
+++ b/examples/minimal.py
@@ -1,11 +1,8 @@
-from docling.datamodel.document import DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
-artifacts_path = DocumentConverter.download_models_hf()
-doc_converter = DocumentConverter(artifacts_path=artifacts_path)
-
-input = DocumentConversionInput.from_paths(["factsheet.pdf"])
-converted_docs = doc_converter.convert(input)
-
-for d in converted_docs:
-    print(d.render_as_dict())
+source = "https://arxiv.org/pdf/2206.01062"  # PDF path or URL
+converter = DocumentConverter()
+doc = converter.convert_single(source)
+print(
+    doc.export_to_markdown()
+)  # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
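
The four configurations in `examples/custom_convert.py` above differ in only two switches: whether OCR runs, and which PDF backend parses the file. A minimal sketch of collapsing them into one helper, assuming the same docling APIs the patch uses (`PipelineOptions`, `DocumentConverter`, `DoclingParseDocumentBackend`, `PyPdfiumDocumentBackend`); `make_converter` is a hypothetical name, not part of the patch:

```python
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.document_converter import DocumentConverter


def make_converter(do_ocr: bool = False, use_docling_parse: bool = True) -> DocumentConverter:
    """Hypothetical helper: collapses the four sections of custom_convert.py."""
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = do_ocr
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Pick the PDF parsing backend; both classes are imported in custom_convert.py.
    backend = DoclingParseDocumentBackend if use_docling_parse else PyPdfiumDocumentBackend
    return DocumentConverter(pipeline_options=pipeline_options, pdf_backend=backend)


# Example: the "Docling Parse with OCR" configuration.
doc_converter = make_converter(do_ocr=True, use_docling_parse=True)
```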