From 3da166eafa3c119de961510341cb92397652c222 Mon Sep 17 00:00:00 2001 From: Abhishek Kumar Date: Wed, 11 Dec 2024 19:36:10 +0530 Subject: [PATCH] feat: Add timeout limit to document parsing job. DS4SD#270 (#552) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Abhishek Kumar Testing: (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=10 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf WARNING:docling.pipeline.base_pipeline:Document processing time (24.555 seconds) exceeded the specified timeout of 10.000 seconds INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 36.29 sec. WARNING:docling.cli.main:Document /var/folders/d7/dsfkllxs0xs8x2t4fcjknj4c0000gn/T/tmpl6p08u5i/2206.01062v1.pdf failed to convert. INFO:docling.cli.main:Processed 1 docs, of which 1 failed INFO:docling.cli.main:All documents were converted in 36.29 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --document-timeout=100 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 58.36 sec. INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 58.56 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling https://arxiv.org/pdf/2206.01062 --verbose INFO:docling.document_converter:Going to convert document batch... INFO:docling.pipeline.base_pipeline:Processing document 2206.01062v1.pdf INFO:docling.document_converter:Finished converting document 2206.01062v1.pdf in 59.82 sec. 
INFO:docling.cli.main:writing Markdown output to 2206.01062v1.md INFO:docling.cli.main:Processed 1 docs, of which 0 failed INFO:docling.cli.main:All documents were converted in 59.88 seconds. (.venv) mario@Abhisheks-MacBook-Air docling % docling Usage: docling [OPTIONS] source ╭─ Arguments ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] [required] │ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ --from [docx|pptx|html|image|pdf|asciido Specify input formats to convert │ │ c|md|xlsx] from. Defaults to all formats. │ │ [default: None] │ │ --to [md|json|html|text|doctags] Specify output formats. Defaults to │ │ Markdown. │ │ [default: None] │ │ --image-export-mode [placeholder|embedded|referenced] Image export mode for the document │ │ (only in case of JSON, Markdown or │ │ HTML). With `placeholder`, only the │ │ position of the image is marked in │ │ the output. In `embedded` mode, the │ │ image is embedded as base64 encoded │ │ string. In `referenced` mode, the │ │ image is exported in PNG format and │ │ referenced from the main exported │ │ document. │ │ [default: embedded] │ │ --ocr --no-ocr If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: ocr] │ │ --force-ocr --no-force-ocr Replace any existing text with OCR │ │ generated text over the full │ │ content. │ │ [default: no-force-ocr] │ │ --ocr-engine [easyocr|tesseract_cli|tesseract| The OCR engine to use. │ │ ocrmac|rapidocr] [default: easyocr] │ │ --ocr-lang TEXT Provide a comma-separated list of │ │ languages used by the OCR engine. 
│ │ Note that each OCR engine has │ │ different values for the language │ │ names. │ │ [default: None] │ │ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ │ [default: dlparse_v2] │ │ --table-mode [fast|accurate] The mode to use in the table │ │ structure model. │ │ [default: fast] │ │ --artifacts-path PATH If provided, the location of the │ │ model artifacts. │ │ [default: None] │ │ --abort-on-error --no-abort-on-error If enabled, the bitmap content will │ │ be processed using OCR. │ │ [default: no-abort-on-error] │ │ --output PATH Output directory where results are │ │ saved. │ │ [default: .] │ │ --verbose -v INTEGER Set the verbosity level. -v for │ │ info logging, -vv for debug │ │ logging. │ │ [default: 0] │ │ --debug-visualize-cells --no-debug-visualize-cells Enable debug output which │ │ visualizes the PDF cells │ │ [default: no-debug-visualize-cells] │ │ --debug-visualize-ocr --no-debug-visualize-ocr Enable debug output which │ │ visualizes the OCR cells │ │ [default: no-debug-visualize-ocr] │ │ --debug-visualize-layout --no-debug-visualize-layout Enable debug output which │ │ visualizes the layour clusters │ │ [default: │ │ no-debug-visualize-layout] │ │ --debug-visualize-tables --no-debug-visualize-tables Enable debug output which │ │ visualizes the table cells │ │ [default: │ │ no-debug-visualize-tables] │ │ --version Show version information. │ │ --document-timeout FLOAT The timeout for processing each │ │ document, in seconds. │ │ [default: None] │ │ --help Show this message and exit. 
│ ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ --- docling/cli/main.py | 8 ++++++++ docling/datamodel/pipeline_options.py | 3 ++- docling/pipeline/base_pipeline.py | 20 +++++++++++++++++--- 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index 260d8152..f9446ee2 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -250,6 +250,13 @@ def convert( help="Show version information.", ), ] = None, + document_timeout: Annotated[ + Optional[float], + typer.Option( + ..., + help="The timeout for processing each document, in seconds.", + ), + ] = None, ): if verbose == 0: logging.basicConfig(level=logging.WARNING) @@ -333,6 +340,7 @@ def convert( do_ocr=ocr, ocr_options=ocr_options, do_table_structure=True, + document_timeout=document_timeout, ) pipeline_options.table_structure_options.do_cell_matching = ( True # do_cell_matching diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 235b5b7f..3d240457 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -150,8 +150,9 @@ class PipelineOptions(BaseModel): """Base pipeline options.""" create_legacy_output: bool = ( - True # This defautl will be set to False on a future version of docling + True # This default will be set to False on a future version of docling ) + document_timeout: Optional[float] = None class PdfPipelineOptions(PipelineOptions): diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 5013ad58..5d3b7686 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -126,6 +126,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: # conv_res.status = ConversionStatus.FAILURE # return conv_res + total_elapsed_time = 0.0 with TimeRecorder(conv_res, "doc_build", 
scope=ProfilingScope.DOCUMENT): for i in range(0, conv_res.input.page_count): @@ -136,7 +137,7 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: for page_batch in chunkify( conv_res.pages, settings.perf.page_batch_size ): - start_pb_time = time.time() + start_batch_time = time.monotonic() # 1. Initialise the page resources init_pages = map( @@ -149,8 +150,21 @@ def _build_document(self, conv_res: ConversionResult) -> ConversionResult: for p in pipeline_pages: # Must exhaust! pass - end_pb_time = time.time() - start_pb_time - _log.debug(f"Finished converting page batch time={end_pb_time:.3f}") + end_batch_time = time.monotonic() + total_elapsed_time += end_batch_time - start_batch_time + if ( + self.pipeline_options.document_timeout is not None + and total_elapsed_time > self.pipeline_options.document_timeout + ): + _log.warning( + f"Document processing time ({total_elapsed_time:.3f} seconds) exceeded the specified timeout of {self.pipeline_options.document_timeout:.3f} seconds" + ) + conv_res.status = ConversionStatus.PARTIAL_SUCCESS + break + + _log.debug( + f"Finished converting page batch time={end_batch_time - start_batch_time:.3f}" + ) except Exception as e: conv_res.status = ConversionStatus.FAILURE