diff --git a/docling/cli/main.py b/docling/cli/main.py index f5e2e1a1..35ae01df 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -5,12 +5,15 @@ import warnings from enum import Enum from pathlib import Path -from typing import Annotated, Dict, Iterable, List, Optional +from typing import Annotated, Dict, Iterable, List, Optional, Type import typer from docling_core.utils.file import resolve_file_source from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.pdf_backend import PdfDocumentBackend +from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ( ConversionStatus, FormatToExtensions, @@ -22,6 +25,7 @@ EasyOcrOptions, OcrOptions, PdfPipelineOptions, + TableFormerMode, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -58,9 +62,10 @@ def version_callback(value: bool): # Define an enum for the backend options -class Backend(str, Enum): +class PdfBackend(str, Enum): PYPDFIUM2 = "pypdfium2" - DOCLING = "docling" + DLPARSE_V1 = "dlparse_v1" + DLPARSE_V2 = "dlparse_v2" # Define an enum for the ocr engines @@ -151,6 +156,17 @@ def convert( ocr_engine: Annotated[ OcrEngine, typer.Option(..., help="The OCR engine to use.") ] = OcrEngine.EASYOCR, + pdf_backend: Annotated[ + PdfBackend, typer.Option(..., help="The PDF backend to use.") + ] = PdfBackend.DLPARSE_V1, + table_mode: Annotated[ + TableFormerMode, + typer.Option(..., help="The mode to use in the table structure model."), + ] = TableFormerMode.FAST, + artifacts_path: Annotated[ + Optional[Path], + typer.Option(..., help="If provided, the location of the model artifacts."), + ] = None, abort_on_error: Annotated[ bool, typer.Option( @@ -217,11 +233,25 @@ def convert( do_table_structure=True, ) pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching + pipeline_options.table_structure_options.mode = table_mode + + if artifacts_path is not None: + pipeline_options.artifacts_path = artifacts_path + + match pdf_backend: + case PdfBackend.DLPARSE_V1: + backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend + case PdfBackend.DLPARSE_V2: + backend = DoclingParseV2DocumentBackend + case PdfBackend.PYPDFIUM2: + backend = PyPdfiumDocumentBackend + case _: + raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") format_options: Dict[InputFormat, FormatOption] = { InputFormat.PDF: PdfFormatOption( pipeline_options=pipeline_options, - backend=DoclingParseDocumentBackend, # pdf_backend + backend=backend, # pdf_backend ) } doc_converter = DocumentConverter( diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index efaa6ff8..d57f1671 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,4 +1,4 @@ -from enum import Enum, auto +from enum import Enum from pathlib import Path from typing import List, Literal, Optional, Union @@ -6,8 +6,8 @@ class TableFormerMode(str, Enum): - FAST = auto() - ACCURATE = auto() + FAST = "fast" + ACCURATE = "accurate" class TableStructureOptions(BaseModel): diff --git a/docs/usage.md b/docs/usage.md index 5493c2ee..a5be4299 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -32,30 +32,37 @@ Here are the available options as of this writing (for an up-to-date listing, ru ```console $ docling --help - Usage: docling [OPTIONS] source - + Usage: docling [OPTIONS] source + ╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ │ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │ │ [required] │ ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ -│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │ -│ Defaults to all formats. │ -│ [default: None] │ -│ --to [md|json|text|doctags] Specify output formats. Defaults to │ -│ Markdown. │ -│ [default: None] │ -│ --ocr --no-ocr If enabled, the bitmap content will be │ -│ processed using OCR. │ -│ [default: ocr] │ -│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │ -│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ -│ processed using OCR. │ -│ [default: no-abort-on-error] │ -│ --output PATH Output directory where results are saved. │ -│ [default: .] │ -│ --version Show version information. │ -│ --help Show this message and exit. │ +│ --from [docx|pptx|html|image|pdf|asciidoc|md] Specify input formats to convert from. │ +│ Defaults to all formats. │ +│ [default: None] │ +│ --to [md|json|text|doctags] Specify output formats. Defaults to │ +│ Markdown. │ +│ [default: None] │ +│ --ocr --no-ocr If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: ocr] │ +│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. │ +│ [default: easyocr] │ +│ --pdf-backend [pypdfium2|dlparse_v1|dlparse_v2] The PDF backend to use. │ +│ [default: dlparse_v1] │ +│ --table-mode [fast|accurate] The mode to use in the table structure │ +│ model. │ +│ [default: fast] │ +│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: no-abort-on-error] │ +│ --output PATH Output directory where results are │ +│ saved. │ +│ [default: .] │ +│ --version Show version information. │ +│ --help Show this message and exit. │ ╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ ```