From ed785ea122d8d736c2031a38fce81dc5c19e244c Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Tue, 19 Nov 2024 15:58:49 +0100
Subject: [PATCH] feat: expose ocr-lang in CLI (#375)

* feat: expose ocr-lang in CLI

Signed-off-by: Michele Dolfi

* use regex for supporting multiple sep

Signed-off-by: Michele Dolfi

---------

Signed-off-by: Michele Dolfi
---
Reviewer notes (this section is ignored by `git am`; not part of the commit):
- Line structure was restored from a copy collapsed onto two lines. Line
  breaks agree with every hunk header count; indentation widths were
  re-derived from the Python syntax, so run `git apply --check` first.
- The `--ocr-lang` help text says "comma-separated", while `_split_list`
  splits on both ';' and ','.
- `re.split` neither strips whitespace nor drops empty fields: "en, fr"
  yields ["en", " fr"] and "" yields [""].

 docling/cli/main.py                   | 18 ++++++++++++++++++
 docling/datamodel/pipeline_options.py |  1 +
 2 files changed, 19 insertions(+)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index c95128ac..a2a86bf4 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -1,6 +1,7 @@
 import importlib
 import json
 import logging
+import re
 import time
 import warnings
 from enum import Enum
@@ -129,6 +130,12 @@ def export_documents(
     )
 
 
+def _split_list(raw: Optional[str]) -> Optional[List[str]]:
+    if raw is None:
+        return None
+    return re.split(r"[;,]", raw)
+
+
 @app.command(no_args_is_help=True)
 def convert(
     input_sources: Annotated[
@@ -163,6 +170,13 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    ocr_lang: Annotated[
+        Optional[str],
+        typer.Option(
+            ...,
+            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
+        ),
+    ] = None,
     pdf_backend: Annotated[
         PdfBackend, typer.Option(..., help="The PDF backend to use.")
     ] = PdfBackend.DLPARSE_V1,
@@ -248,6 +262,10 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 
+    ocr_lang_list = _split_list(ocr_lang)
+    if ocr_lang_list is not None:
+        ocr_options.lang = ocr_lang_list
+
     pipeline_options = PdfPipelineOptions(
         do_ocr=ocr,
         ocr_options=ocr_options,
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 2b9d228c..6c0711cc 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    lang: List[str]
     force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
     bitmap_area_threshold: float = (
         0.05  # percentage of the area for a bitmap to processed with OCR