Skip to content

Commit

Permalink
feat: expose ocr-lang in CLI (#375)
Browse files Browse the repository at this point in the history
* feat: expose ocr-lang in CLI

Signed-off-by: Michele Dolfi <[email protected]>

* use regex for supporting multiple sep

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Nov 19, 2024
1 parent 926dfd2 commit ed785ea
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
18 changes: 18 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import importlib
import json
import logging
import re
import time
import warnings
from enum import Enum
Expand Down Expand Up @@ -129,6 +130,12 @@ def export_documents(
)


def _split_list(raw: Optional[str]) -> Optional[List[str]]:
if raw is None:
return None
return re.split(r"[;,]", raw)


@app.command(no_args_is_help=True)
def convert(
input_sources: Annotated[
Expand Down Expand Up @@ -163,6 +170,13 @@ def convert(
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
ocr_lang: Annotated[
Optional[str],
typer.Option(
...,
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
),
] = None,
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V1,
Expand Down Expand Up @@ -248,6 +262,10 @@ def convert(
case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list

pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
ocr_options=ocr_options,
Expand Down
1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):

class OcrOptions(BaseModel):
kind: str
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to be processed with OCR
Expand Down

0 comments on commit ed785ea

Please sign in to comment.