diff --git a/docling/cli/main.py b/docling/cli/main.py index c95128ac..a2a86bf4 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -1,6 +1,7 @@ import importlib import json import logging +import re import time import warnings from enum import Enum @@ -129,6 +130,12 @@ def export_documents( ) +def _split_list(raw: Optional[str]) -> Optional[List[str]]: + if raw is None: + return None + return re.split(r"[;,]", raw) + + @app.command(no_args_is_help=True) def convert( input_sources: Annotated[ @@ -163,6 +170,13 @@ def convert( ocr_engine: Annotated[ OcrEngine, typer.Option(..., help="The OCR engine to use.") ] = OcrEngine.EASYOCR, + ocr_lang: Annotated[ + Optional[str], + typer.Option( + ..., + help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.", + ), + ] = None, pdf_backend: Annotated[ PdfBackend, typer.Option(..., help="The PDF backend to use.") ] = PdfBackend.DLPARSE_V1, @@ -248,6 +262,10 @@ def convert( case _: raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") + ocr_lang_list = _split_list(ocr_lang) + if ocr_lang_list is not None: + ocr_options.lang = ocr_lang_list + pipeline_options = PdfPipelineOptions( do_ocr=ocr, ocr_options=ocr_options, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 2b9d228c..6c0711cc 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel): class OcrOptions(BaseModel): kind: str + lang: List[str] force_full_page_ocr: bool = False # If enabled a full page OCR is always applied bitmap_area_threshold: float = ( 0.05 # percentage of the area for a bitmap to processed with OCR