Add redbooks to test data, small additions (#35)

Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
DS4SD · Aug 20, 2024 · c253dd7
1 parent a13114b
commit c253dd7
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
 
+# On container environments, always set a thread budget to avoid undesired thread congestion.
+ENV OMP_NUM_THREADS=4
+
 # On container shell:
 # > cd /root/
 # > python minimal.py
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
@@ -1,4 +1,6 @@
+import logging
 import random
+import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
 
+_log = logging.getLogger(__name__)
+
 
 class DoclingParsePageBackend(PdfPageBackend):
     def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -151,11 +155,19 @@ def __init__(self, path_or_stream: Union[BytesIO, Path]):
         self._pdoc = pdfium.PdfDocument(path_or_stream)
         # Parsing cells with docling_parser call
         parser = pdf_parser()
+
+        start_pb_time = time.time()
+
         if isinstance(path_or_stream, BytesIO):
             self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
         else:
             self._parser_doc = parser.find_cells(str(path_or_stream))
 
+        end_pb_time = time.time() - start_pb_time
+        _log.info(
+            f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+        )
+
     def page_count(self) -> int:
         return len(self._parser_doc["pages"])
 

diff --git a/examples/batch_convert.py b/examples/batch_convert.py
@@ -48,6 +48,8 @@ def main():
         Path("./test/data/2206.01062.pdf"),
         Path("./test/data/2203.01017v2.pdf"),
         Path("./test/data/2305.03393v1.pdf"),
+        Path("./test/data/redp5110.pdf"),
+        Path("./test/data/redp5695.pdf"),
     ]
 
     doc_converter = DocumentConverter()

diff --git a/test/data/redp5110.pdf b/test/data/redp5110.pdf
diff --git a/test/data/redp5695.pdf b/test/data/redp5695.pdf