feat: add options for choosing OCR engines (#118)

--------- Signed-off-by: Michele Dolfi <[email protected]> Signed-off-by: Nikos Livathinos <[email protected]> Signed-off-by: Peter Staar <[email protected]> Co-authored-by: Nikos Livathinos <[email protected]> Co-authored-by: Peter Staar <[email protected]>
DS4SD · Oct 8, 2024 · f96ea86 · f96ea86
1 parent d412c36
commit f96ea86
Show file tree

Hide file tree

Showing 20 changed files with 699 additions and 32 deletions.
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
@@ -9,6 +9,11 @@ jobs:
         python-version: ['3.10', '3.11', '3.12']
     steps:
       - uses: actions/checkout@v3
+      - name: Install tesseract
+        run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
+      - name: Set TESSDATA_PREFIX
+        run: |
+          echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
       - uses: ./.github/actions/setup-poetry
         with:
           python-version: ${{ matrix.python-version }}
@@ -32,4 +37,4 @@ jobs:
             poetry run python "$file" || exit 1
           done
       - name: Build with poetry
-        run: poetry build
+        run: poetry build
diff --git a/README.md b/README.md
@@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
   ```
 </details>
 
+<details>
+  <summary><b>Alternative OCR engines</b></summary>
+
+  Docling supports multiple OCR engines for processing scanned documents. The current version provides
+  the following engines.
+
+  | Engine | Installation | Usage |
+  | ------ | ------------ | ----- |
+  | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
+  | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
+  | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+
+  The Docling `DocumentConverter` lets you choose the OCR engine through the `ocr_options` setting. For example:
+
+  ```python
+    from docling.datamodel.base_models import ConversionStatus
+    from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
+    from docling.document_converter import DocumentConverter
+
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+    )
+  ```
+
+  #### Tesseract installation
+
+  [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
+  on most operating systems. For using this engine with Docling, Tesseract must be installed on your
+  system, using the packaging tool of your choice. Below we provide example commands.
+  After installing Tesseract you are expected to provide the path to its language files using the
+  `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
+
+  For macOS, we recommend using [Homebrew](https://brew.sh/).
+
+  ```console
+  brew install tesseract leptonica pkg-config
+  TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  For Debian-based systems.
+
+  ```console
+  apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
+  TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  For RHEL systems.
+
+  ```console
+  dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+  TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  #### Linking to Tesseract
+  The most efficient way to use the Tesseract library is via linking. Docling uses
+  the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
+
+  If you run into installation issues with Tesserocr, we suggest using the following
+  installation options:
+
+  ```console
+  pip uninstall tesserocr
+  pip install --no-binary :all: tesserocr
+  ```
+</details>
+
 <details>
   <summary><b>Docling development setup</b></summary>
 

diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -14,7 +14,12 @@
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
 from docling.document_converter import DocumentConverter
 
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -53,6 +58,13 @@ class Backend(str, Enum):
     DOCLING = "docling"
 
 
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+
+
 def export_documents(
     conv_results: Iterable[ConversionResult],
     output_dir: Path,
@@ -152,6 +164,9 @@ def convert(
     backend: Annotated[
         Backend, typer.Option(..., help="The PDF backend to use.")
     ] = Backend.DOCLING,
+    ocr_engine: Annotated[
+        OcrEngine, typer.Option(..., help="The OCR engine to use.")
+    ] = OcrEngine.EASYOCR,
     output: Annotated[
         Path, typer.Option(..., help="Output directory where results are saved.")
     ] = Path("."),
@@ -191,8 +206,19 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected backend type {backend}")
 
+    match ocr_engine:  # map the selected OcrEngine member to its options object
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case _:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
     pipeline_options = PipelineOptions(
         do_ocr=ocr,
+        ocr_options=ocr_options,
         do_table_structure=True,
     )
     pipeline_options.table_structure_options.do_cell_matching = do_cell_matching

diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
@@ -110,7 +110,10 @@ def from_tuple(cls, coord: Tuple[float, ...], origin: CoordOrigin):
             return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
 
     def area(self) -> float:
-        return (self.r - self.l) * (self.b - self.t)
+        area = (self.r - self.l) * (self.b - self.t)
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            area = -area
+        return area
 
     def intersection_area_with(self, other: "BoundingBox") -> float:
         # Calculate intersection coordinates

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -1,6 +1,7 @@
 from enum import Enum, auto
+from typing import List, Literal, Optional, Union
 
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field
 
 
 class TableFormerMode(str, Enum):
@@ -18,8 +19,49 @@ class TableStructureOptions(BaseModel):
     mode: TableFormerMode = TableFormerMode.FAST
 
 
+class OcrOptions(BaseModel):
+    kind: str
+
+
+class EasyOcrOptions(OcrOptions):
+    kind: Literal["easyocr"] = "easyocr"
+    lang: List[str] = ["fr", "de", "es", "en"]
+    use_gpu: bool = True  # same default as easyocr.Reader
+    model_storage_directory: Optional[str] = None
+    download_enabled: bool = True  # same default as easyocr.Reader
+
+    model_config = ConfigDict(
+        extra="forbid",
+        protected_namespaces=(),
+    )
+
+
+class TesseractCliOcrOptions(OcrOptions):
+    kind: Literal["tesseract"] = "tesseract"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    tesseract_cmd: str = "tesseract"
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
+class TesseractOcrOptions(OcrOptions):
+    kind: Literal["tesserocr"] = "tesserocr"
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
 class PipelineOptions(BaseModel):
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+        Field(EasyOcrOptions(), discriminator="kind")
+    )
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
@@ -3,21 +3,21 @@
 from abc import abstractmethod
 from typing import Iterable, List, Tuple
 
-import numpy
 import numpy as np
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label
 
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import OcrOptions
 
 _log = logging.getLogger(__name__)
 
 
 class BaseOcrModel:
-    def __init__(self, config):
-        self.config = config
-        self.enabled = config["enabled"]
+    def __init__(self, enabled: bool, options: OcrOptions):
+        self.enabled = enabled
+        self.options = options
 
     # Computes the optimum amount and coordinates of rectangles to OCR on a given page
     def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:

diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
@@ -4,21 +4,33 @@
 import numpy
 
 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel
 
 _log = logging.getLogger(__name__)
 
 
 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, enabled: bool, options: EasyOcrOptions):
+        super().__init__(enabled=enabled, options=options)
+        self.options: EasyOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
 
         if self.enabled:
-            import easyocr
+            try:
+                import easyocr
+            except ImportError:
+                raise ImportError(
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                )
 
-            self.reader = easyocr.Reader(config["lang"])
+            self.reader = easyocr.Reader(
+                lang_list=self.options.lang,
+                model_storage_directory=self.options.model_storage_directory,
+                download_enabled=self.options.download_enabled,
+            )
 
     def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
 
@@ -31,6 +43,9 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
 
             all_ocr_cells = []
             for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
                 high_res_image = page._backend.get_page_image(
                     scale=self.scale, cropbox=ocr_rect
                 )