From 704d792a7997c4ca34f9f9045ed4ae02b4f5df47 Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:03:09 +0100 Subject: [PATCH] fix(tesserocr): Raise Exception if tesserocr has not loaded any languages (#279) fix(TesseractOcrModel): Raise Exception if tesserocr has not loaded any languages. Provide a descriptive error message. Signed-off-by: Nikos Livathinos --- docling/models/tesseract_ocr_model.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 42513239..83f23837 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -22,25 +22,37 @@ def __init__(self, enabled: bool, options: TesseractOcrOptions): self.reader = None if self.enabled: - setup_errmsg = ( + install_errmsg = ( "tesserocr is not correctly installed. " "Please install it via `pip install tesserocr` to use this OCR engine. " - "Note that tesserocr might have to be manually compiled for working with" + "Note that tesserocr might have to be manually compiled for working with " "your Tesseract installation. The Docling documentation provides examples for it. " - "Alternatively, Docling has support for other OCR engines. See the documentation." + "Alternatively, Docling has support for other OCR engines. See the documentation: " + "https://ds4sd.github.io/docling/installation/" ) + missing_langs_errmsg = ( + "tesserocr is not correctly configured. No language models have been detected. " + "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. " + "You can find more information how to setup other OCR engines in Docling " + "documentation: " + "https://ds4sd.github.io/docling/installation/" + ) + try: import tesserocr except ImportError: - raise ImportError(setup_errmsg) - + raise ImportError(install_errmsg) try: tesseract_version = tesserocr.tesseract_version() - _log.debug("Initializing TesserOCR: %s", tesseract_version) except: - raise ImportError(setup_errmsg) + raise ImportError(install_errmsg) + + _, tesserocr_languages = tesserocr.get_languages() + if not tesserocr_languages: + raise ImportError(missing_langs_errmsg) # Initialize the tesseractAPI + _log.debug("Initializing TesserOCR: %s", tesseract_version) lang = "+".join(self.options.lang) if self.options.path is not None: self.reader = tesserocr.PyTessBaseAPI(