Skip to content

Commit ed785ea

Browse files
authored
feat: expose ocr-lang in CLI (#375)
* feat: expose ocr-lang in CLI Signed-off-by: Michele Dolfi <[email protected]> * use regex for supporting multiple sep Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
1 parent 926dfd2 commit ed785ea

File tree

2 files changed

+19
-0
lines changed

2 files changed

+19
-0
lines changed

docling/cli/main.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import importlib
22
import json
33
import logging
4+
import re
45
import time
56
import warnings
67
from enum import Enum
@@ -129,6 +130,12 @@ def export_documents(
129130
)
130131

131132

133+
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
134+
if raw is None:
135+
return None
136+
return re.split(r"[;,]", raw)
137+
138+
132139
@app.command(no_args_is_help=True)
133140
def convert(
134141
input_sources: Annotated[
@@ -163,6 +170,13 @@ def convert(
163170
ocr_engine: Annotated[
164171
OcrEngine, typer.Option(..., help="The OCR engine to use.")
165172
] = OcrEngine.EASYOCR,
173+
ocr_lang: Annotated[
174+
Optional[str],
175+
typer.Option(
176+
...,
177+
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
178+
),
179+
] = None,
166180
pdf_backend: Annotated[
167181
PdfBackend, typer.Option(..., help="The PDF backend to use.")
168182
] = PdfBackend.DLPARSE_V1,
@@ -248,6 +262,10 @@ def convert(
248262
case _:
249263
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
250264

265+
ocr_lang_list = _split_list(ocr_lang)
266+
if ocr_lang_list is not None:
267+
ocr_options.lang = ocr_lang_list
268+
251269
pipeline_options = PdfPipelineOptions(
252270
do_ocr=ocr,
253271
ocr_options=ocr_options,

docling/datamodel/pipeline_options.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
2222

2323
class OcrOptions(BaseModel):
2424
kind: str
25+
lang: List[str]
2526
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
2627
bitmap_area_threshold: float = (
2728
0.05 # percentage of the area for a bitmap to be processed with OCR

0 commit comments

Comments
 (0)