diff --git a/Dockerfile b/Dockerfile index dd87b0fb..9bed78f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -17,7 +17,7 @@ ENV TORCH_HOME=/tmp/ COPY examples/minimal.py /root/minimal.py RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);' -RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);' +RUN python -c 'from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline; StandardPdfPipeline.download_models_hf(force=True);' # On container environments, always set a thread budget to avoid undesired thread congestion. ENV OMP_NUM_THREADS=4 diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 249c7f78..204ae59f 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -3,9 +3,12 @@ import time from pathlib import Path +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions +from docling.models.tesseract_ocr_model import TesseractOcrOptions _log = logging.getLogger(__name__) @@ -23,32 +26,51 @@ def main(): # PyPdfium without EasyOCR # -------------------- - # pipeline_options = PipelineOptions() - # pipeline_options.do_ocr=False - # pipeline_options.do_table_structure=True + # pipeline_options = PdfPipelineOptions() + # pipeline_options.do_ocr = False + # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = False # doc_converter = DocumentConverter( - # pipeline_options=pipeline_options, - # pdf_backend=PyPdfiumDocumentBackend, + # format_options={ + # InputFormat.PDF: PdfFormatOption( + # pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend + # ) + # } # ) # PyPdfium with EasyOCR # ----------------- - # pipeline_options = PipelineOptions() - # pipeline_options.do_ocr=True - # pipeline_options.do_table_structure=True + # pipeline_options = PdfPipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # doc_converter = DocumentConverter( - # pipeline_options=pipeline_options, - # pdf_backend=PyPdfiumDocumentBackend, + # format_options={ + # InputFormat.PDF: PdfFormatOption( + # pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend + # ) + # } # ) # Docling Parse without EasyOCR # ------------------------- + # pipeline_options = PdfPipelineOptions() + # pipeline_options.do_ocr = False + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + + # doc_converter = DocumentConverter( + # format_options={ + # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + # } + # ) + + # Docling Parse with EasyOCR + # ---------------------- pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = False + pipeline_options.do_ocr = True pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True @@ -58,42 +80,32 @@ def main(): } ) - # Docling Parse with EasyOCR - # ---------------------- - # pipeline_options = PipelineOptions() - # pipeline_options.do_ocr=True - # pipeline_options.do_table_structure=True - # pipeline_options.table_structure_options.do_cell_matching = True - - # doc_converter = DocumentConverter( - # pipeline_options=pipeline_options, - # pdf_backend=DoclingParseDocumentBackend, - # ) - # Docling Parse with Tesseract # ---------------------- - # pipeline_options = PipelineOptions() + # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.ocr_options = TesseractOcrOptions() # doc_converter = DocumentConverter( - # pipeline_options=pipeline_options, - # pdf_backend=DoclingParseDocumentBackend, + # format_options={ + # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + # } # ) # Docling Parse with Tesseract CLI # ---------------------- - # pipeline_options = PipelineOptions() + # pipeline_options = PdfPipelineOptions() # pipeline_options.do_ocr = True # pipeline_options.do_table_structure = True # pipeline_options.table_structure_options.do_cell_matching = True # pipeline_options.ocr_options = TesseractCliOcrOptions() # doc_converter = DocumentConverter( - # pipeline_options=pipeline_options, - # pdf_backend=DoclingParseDocumentBackend, + # format_options={ + # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + # } # ) ###########################################################################