Skip to content

Commit

Permalink
Add redbooks to test data, small additions (#35)
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git and cau-git authored Aug 20, 2024
1 parent a13114b commit c253dd7
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 0 deletions.
3 changes: 3 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'

# In container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4

# In the container shell:
# > cd /root/
# > python minimal.py
12 changes: 12 additions & 0 deletions docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import logging
import random
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
Expand All @@ -11,6 +13,8 @@
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize

_log = logging.getLogger(__name__)


class DoclingParsePageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage, docling_page_obj):
Expand Down Expand Up @@ -151,11 +155,19 @@ def __init__(self, path_or_stream: Union[BytesIO, Path]):
self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call
parser = pdf_parser()

start_pb_time = time.time()

if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
else:
self._parser_doc = parser.find_cells(str(path_or_stream))

end_pb_time = time.time() - start_pb_time
_log.info(
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
)

def page_count(self) -> int:
    """Return how many entries are stored under the "pages" key of the parsed document."""
    parsed_pages = self._parser_doc["pages"]
    return len(parsed_pages)

Expand Down
2 changes: 2 additions & 0 deletions examples/batch_convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ def main():
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
Path("./test/data/redp5110.pdf"),
Path("./test/data/redp5695.pdf"),
]

doc_converter = DocumentConverter()
Expand Down
Binary file added test/data/redp5110.pdf
Binary file not shown.
Binary file added test/data/redp5695.pdf
Binary file not shown.

0 comments on commit c253dd7

Please sign in to comment.