From 3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Sun, 26 Jan 2025 08:07:56 +0100 Subject: [PATCH] feat: Introduce automatic language detection in TesseractOcrCliModel (#800) * feat: Introduce automatic language detection in tesseract_ocr_cli model. Extend unit tests. Signed-off-by: Nikos Livathinos * docs: Add example how to use "auto" language with tesseract OCR engines Signed-off-by: Nikos Livathinos * fix: Refactor the TesseractOcrModel and TesseractOcrCliModel to validate if the auto-detected language is installed in the system and if not fall back to a default option without language. Signed-off-by: Nikos Livathinos --------- Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_cli_model.py | 76 +++++++++++++++++++++-- docling/models/tesseract_ocr_model.py | 74 +++++++++++----------- docling/utils/ocr_utils.py | 9 +++ docs/examples/tesseract_lang_detection.py | 37 +++++++++++ mkdocs.yml | 1 + tests/test_e2e_ocr_conversion.py | 1 + 6 files changed, 157 insertions(+), 41 deletions(-) create mode 100644 docling/utils/ocr_utils.py create mode 100644 docs/examples/tesseract_lang_detection.py diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 3d5c8006..cdc5671d 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -4,7 +4,7 @@ import os import tempfile from subprocess import DEVNULL, PIPE, Popen -from typing import Iterable, Optional, Tuple +from typing import Iterable, List, Optional, Tuple import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin @@ -14,6 +14,7 @@ from docling.datamodel.pipeline_options import TesseractCliOcrOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.ocr_utils import map_tesseract_script from docling.utils.profiling 
import TimeRecorder _log = logging.getLogger(__name__) @@ -28,10 +29,13 @@ def __init__(self, enabled: bool, options: TesseractCliOcrOptions): self._name: Optional[str] = None self._version: Optional[str] = None + self._tesseract_languages: Optional[List[str]] = None + self._script_prefix: Optional[str] = None if self.enabled: try: self._get_name_and_version() + self._set_languages_and_prefix() except Exception as exc: raise RuntimeError( @@ -73,12 +77,20 @@ def _get_name_and_version(self) -> Tuple[str, str]: return name, version def _run_tesseract(self, ifilename: str): - + r""" + Run tesseract CLI + """ cmd = [self.options.tesseract_cmd] - if self.options.lang is not None and len(self.options.lang) > 0: + if "auto" in self.options.lang: + lang = self._detect_language(ifilename) + if lang is not None: + cmd.append("-l") + cmd.append(lang) + elif self.options.lang is not None and len(self.options.lang) > 0: cmd.append("-l") cmd.append("+".join(self.options.lang)) + if self.options.path is not None: cmd.append("--tessdata-dir") cmd.append(self.options.path) @@ -106,6 +118,63 @@ def _run_tesseract(self, ifilename: str): return df_filtered + def _detect_language(self, ifilename: str): + r""" + Run tesseract in PSM 0 mode to detect the language + """ + assert self._tesseract_languages is not None + + cmd = [self.options.tesseract_cmd] + cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"]) + _log.info("command: {}".format(" ".join(cmd))) + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + output, _ = proc.communicate() + decoded_data = output.decode("utf-8") + df = pd.read_csv( + io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] + ) + scripts = df.loc[df["key"] == "Script"].value.tolist() + if len(scripts) == 0: + _log.warning("Tesseract cannot detect the script of the page") + return None + + script = map_tesseract_script(scripts[0].strip()) + lang = f"{self._script_prefix}{script}" + + # Check if the detected language has been installed + 
if lang not in self._tesseract_languages: + msg = f"Tesseract detected the script '{script}' and language '{lang}'." + msg += " However this language is not installed in your system and will be ignored." + _log.warning(msg) + return None + + _log.debug( + f"Using tesseract model for the detected script '{script}' and language '{lang}'" + ) + return lang + + def _set_languages_and_prefix(self): + r""" + Read and set the languages installed in tesseract and decide the script prefix + """ + # Get all languages + cmd = [self.options.tesseract_cmd] + cmd.append("--list-langs") + _log.info("command: {}".format(" ".join(cmd))) + proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) + output, _ = proc.communicate() + decoded_data = output.decode("utf-8") + df = pd.read_csv(io.StringIO(decoded_data), header=None) + self._tesseract_languages = df[0].tolist()[1:] + + # Decide the script prefix + if any([l.startswith("script/") for l in self._tesseract_languages]): + script_prefix = "script/" + else: + script_prefix = "" + + self._script_prefix = script_prefix + def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: @@ -120,7 +189,6 @@ def __call__( yield page else: with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 6a1b60ee..5b70155e 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -8,6 +8,7 @@ from docling.datamodel.pipeline_options import TesseractOcrOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.ocr_utils import map_tesseract_script from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) @@ -20,6 +21,7 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions): self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
self.reader = None + self.osd_reader = None if self.enabled: install_errmsg = ( @@ -47,8 +49,8 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions): except: raise ImportError(install_errmsg) - _, tesserocr_languages = tesserocr.get_languages() - if not tesserocr_languages: + _, self._tesserocr_languages = tesserocr.get_languages() + if not self._tesserocr_languages: raise ImportError(missing_langs_errmsg) # Initialize the tesseractAPI @@ -57,7 +59,7 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions): self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} - if any([l.startswith("script/") for l in tesserocr_languages]): + if any([l.startswith("script/") for l in self._tesserocr_languages]): self.script_prefix = "script/" else: self.script_prefix = "" @@ -72,14 +74,14 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions): tesserocr_kwargs["path"] = self.options.path if lang == "auto": - self.reader = tesserocr.PyTessBaseAPI( + self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs) + self.osd_reader = tesserocr.PyTessBaseAPI( **{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs ) else: self.reader = tesserocr.PyTessBaseAPI( **{"lang": lang} | tesserocr_kwargs, ) - self.reader_RIL = tesserocr.RIL def __del__(self): @@ -96,8 +98,6 @@ def __call__( yield from page_batch return - import tesserocr - for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): @@ -105,6 +105,7 @@ def __call__( else: with TimeRecorder(conv_res, "ocr"): assert self.reader is not None + assert self._tesserocr_languages is not None ocr_rects = self.get_ocr_rects(page) @@ -117,43 +118,42 @@ def __call__( scale=self.scale, cropbox=ocr_rect ) - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) + local_reader = self.reader + if "auto" in self.options.lang: + assert self.osd_reader is not None - if self.options.lang == ["auto"]: - osd = 
self.reader.DetectOrientationScript() + self.osd_reader.SetImage(high_res_image) + osd = self.osd_reader.DetectOrientationScript() # No text, probably if osd is None: continue script = osd["script_name"] - - if script == "Katakana" or script == "Hiragana": - script = "Japanese" - elif script == "Han": - script = "HanS" - elif script == "Korean": - script = "Hangul" - - _log.debug( - f'Using model for the detected script "{script}"' - ) - - if script not in self.script_readers: - self.script_readers[script] = tesserocr.PyTessBaseAPI( - path=self.reader.GetDatapath(), - lang=f"{self.script_prefix}{script}", - psm=tesserocr.PSM.AUTO, - init=True, - oem=tesserocr.OEM.DEFAULT, - ) - - local_reader = self.script_readers[script] - local_reader.SetImage(high_res_image) - else: - local_reader = self.reader - + script = map_tesseract_script(script) + lang = f"{self.script_prefix}{script}" + + # Check if the detected language is present in the system + if lang not in self._tesserocr_languages: + msg = f"Tesseract detected the script '{script}' and language '{lang}'." + msg += " However this language is not installed in your system and will be ignored." 
+ _log.warning(msg) + else: + if script not in self.script_readers: + import tesserocr + + self.script_readers[script] = ( + tesserocr.PyTessBaseAPI( + path=self.reader.GetDatapath(), + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + ) + local_reader = self.script_readers[script] + + local_reader.SetImage(high_res_image) boxes = local_reader.GetComponentImages( self.reader_RIL.TEXTLINE, True ) diff --git a/docling/utils/ocr_utils.py b/docling/utils/ocr_utils.py new file mode 100644 index 00000000..59503f1f --- /dev/null +++ b/docling/utils/ocr_utils.py @@ -0,0 +1,9 @@ +def map_tesseract_script(script: str) -> str: + r""" """ + if script == "Katakana" or script == "Hiragana": + script = "Japanese" + elif script == "Han": + script = "HanS" + elif script == "Korean": + script = "Hangul" + return script diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py new file mode 100644 index 00000000..b75e4707 --- /dev/null +++ b/docs/examples/tesseract_lang_detection.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ( + PdfPipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.document_converter import DocumentConverter, PdfFormatOption + + +def main(): + input_doc = Path("./tests/data/2206.01062.pdf") + + # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions + # ocr_options = TesseractOcrOptions(lang=["auto"]) + ocr_options = TesseractCliOcrOptions(lang=["auto"]) + + pipeline_options = PdfPipelineOptions( + do_ocr=True, force_full_page_ocr=True, ocr_options=ocr_options + ) + + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ) + } + ) + + doc = converter.convert(input_doc).document + md = doc.export_to_markdown() + print(md) + + +if __name__ == "__main__": 
+ main() diff --git a/mkdocs.yml b/mkdocs.yml index 0f3e9dd0..2b2e2da0 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -75,6 +75,7 @@ nav: - "Table export": examples/export_tables.py - "Multimodal export": examples/export_multimodal.py - "Force full page OCR": examples/full_page_ocr.py + - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py - "Accelerator options": examples/run_with_accelerator.py - "Simple translation": examples/translate.py - ✂️ Chunking: diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index b3cdd312..4a542d21 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -62,6 +62,7 @@ def test_e2e_conversions(): TesseractOcrOptions(force_full_page_ocr=True), TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), TesseractCliOcrOptions(force_full_page_ocr=True), + TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), RapidOcrOptions(force_full_page_ocr=True), ]