feat: add support for ocrmac OCR engine on macOS (#276)

* feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <[email protected]> * updated the poetry lock Signed-off-by: Suhwan Seo <[email protected]> * Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo <[email protected]> * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <[email protected]> * docs: update examples and installation for ocrmac support - Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples. - Included usage comments and examples for `OcrMacOptions` in OCR pipelines. - Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+). - Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend. This enhances documentation for users working on macOS to leverage `ocrmac` effectively. Signed-off-by: Suhwan Seo <[email protected]> * fix: update `ocrmac` dependency with macOS-specific marker - Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility. - Updated the content hash in `poetry.lock` to reflect the changes. 
This ensures the `ocrmac` dependency is only installed on macOS systems. Signed-off-by: Suhwan Seo <[email protected]> --------- Signed-off-by: Suhwan Seo <[email protected]> Co-authored-by: Suhwan Seo <[email protected]>
DS4SD · Nov 20, 2024 · 6efa96c · 6efa96c
1 parent 32ebf55
commit 6efa96c
Show file tree

Hide file tree

Showing 10 changed files with 311 additions and 14 deletions.
diff --git a/docling/cli/main.py b/docling/cli/main.py
@@ -24,6 +24,7 @@
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     OcrOptions,
     PdfPipelineOptions,
     TableFormerMode,
@@ -74,6 +75,7 @@ class OcrEngine(str, Enum):
     EASYOCR = "easyocr"
     TESSERACT_CLI = "tesseract_cli"
     TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
 
 
 def export_documents(
@@ -259,6 +261,8 @@ def convert(
             ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
         case OcrEngine.TESSERACT:
             ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        case OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
         case _:
             raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
 

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
@@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
     )
 
 
+class OcrMacOptions(OcrOptions):
+    """Options for the `ocrmac` OCR engine.
+
+    Selected through the `kind` discriminator of
+    `PdfPipelineOptions.ocr_options`; the values below are forwarded to
+    `ocrmac.OCR` by `OcrMacModel`.
+    """
+
+    # Discriminator value identifying this options class in the ocr_options union.
+    kind: Literal["ocrmac"] = "ocrmac"
+    # Passed to ocrmac as `language_preference`.
+    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
+    # Passed to ocrmac as `recognition_level`.
+    recognition: str = "accurate"
+    # Passed to ocrmac as `framework`.
+    framework: str = "vision"
+
+    # Reject unknown fields instead of silently ignoring them.
+    model_config = ConfigDict(
+        extra="forbid",
+    )
+
+
 class PipelineOptions(BaseModel):
     create_legacy_output: bool = (
         True  # This defautl will be set to False on a future version of docling
@@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions):
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
-    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
-        Field(EasyOcrOptions(), discriminator="kind")
-    )
+    ocr_options: Union[
+        EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
+    ] = Field(EasyOcrOptions(), discriminator="kind")
 
     images_scale: float = 1.0
     generate_page_images: bool = False

diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py
@@ -0,0 +1,118 @@
+import logging
+import tempfile
+from typing import Iterable, Optional, Tuple
+
+from docling_core.types.doc import BoundingBox, CoordOrigin
+
+from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import OcrMacOptions
+from docling.datamodel.settings import settings
+from docling.models.base_ocr_model import BaseOcrModel
+from docling.utils.profiling import TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class OcrMacModel(BaseOcrModel):
+    """OCR model that runs the `ocrmac` package on page regions.
+
+    For every OCR rectangle of a page, the region is rendered to a temporary
+    PNG file, handed to `ocrmac.OCR`, and the recognized boxes are converted
+    into `OcrCell` objects.
+    """
+
+    def __init__(self, enabled: bool, options: OcrMacOptions):
+        """Set up the model and, if enabled, import `ocrmac`.
+
+        Args:
+            enabled: When False, `ocrmac` is never imported and `__call__`
+                passes pages through untouched.
+            options: Engine options forwarded to `ocrmac.OCR`.
+
+        Raises:
+            ImportError: If `enabled` is True and `ocrmac` cannot be imported.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: OcrMacOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        if self.enabled:
+            install_errmsg = (
+                "ocrmac is not correctly installed. "
+                "Please install it via `pip install ocrmac` to use this OCR engine. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation: "
+                "https://ds4sd.github.io/docling/installation/"
+            )
+            # Imported lazily so the module can be loaded without ocrmac installed.
+            try:
+                from ocrmac import ocrmac
+            except ImportError:
+                raise ImportError(install_errmsg)
+
+            # Keeps a reference to the OCR class itself; it is instantiated
+            # once per image in __call__.
+            self.reader_RIL = ocrmac.OCR
+
+    def __call__(
+        self, conv_res: ConversionResult, page_batch: Iterable[Page]
+    ) -> Iterable[Page]:
+        """Run OCR on each page of `page_batch` and yield the pages.
+
+        Pages are yielded unchanged when the model is disabled or when the
+        page backend reports it is not valid. Otherwise `page.cells` is
+        replaced by the result of `post_process_cells`.
+        """
+
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                with TimeRecorder(conv_res, "ocr"):
+
+                    ocr_rects = self.get_ocr_rects(page)
+
+                    all_ocr_cells = []
+                    for ocr_rect in ocr_rects:
+                        # Skip zero area boxes
+                        if ocr_rect.area() == 0:
+                            continue
+                        # Render only the region to OCR, upscaled by self.scale.
+                        high_res_image = page._backend.get_page_image(
+                            scale=self.scale, cropbox=ocr_rect
+                        )
+
+                        # ocrmac is given a file path here, so the image is
+                        # written to a temporary PNG by name; the file is
+                        # deleted when the context manager exits.
+                        with tempfile.NamedTemporaryFile(
+                            suffix=".png", mode="w"
+                        ) as image_file:
+                            fname = image_file.name
+                            high_res_image.save(fname)
+
+                            boxes = self.reader_RIL(
+                                fname,
+                                recognition_level=self.options.recognition,
+                                framework=self.options.framework,
+                                language_preference=self.options.lang,
+                            ).recognize()
+
+                        im_width, im_height = high_res_image.size
+                        cells = []
+                        for ix, (text, confidence, box) in enumerate(boxes):
+                            # presumably box is (x, y, width, height) normalized
+                            # to the image size with a bottom-left origin, as the
+                            # arithmetic below implies -- confirm against ocrmac.
+                            x = float(box[0])
+                            y = float(box[1])
+                            w = float(box[2])
+                            h = float(box[3])
+
+                            # Scale to pixels and flip the y axis to a top-left origin.
+                            x1 = x * im_width
+                            y2 = (1 - y) * im_height
+
+                            x2 = x1 + w * im_width
+                            y1 = y2 - h * im_height
+
+                            # Undo the render upscaling.
+                            left = x1 / self.scale
+                            top = y1 / self.scale
+                            right = x2 / self.scale
+                            bottom = y2 / self.scale
+
+                            # NOTE(review): these coordinates are relative to the
+                            # cropped image; no ocr_rect offset is added here.
+                            # Confirm whether post_process_cells expects page
+                            # coordinates when ocr_rect is not the full page.
+                            cells.append(
+                                OcrCell(
+                                    id=ix,
+                                    text=text,
+                                    confidence=confidence,
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(left, top, right, bottom),
+                                        origin=CoordOrigin.TOPLEFT,
+                                    ),
+                                )
+                            )
+
+                        # del high_res_image
+                        all_ocr_cells.extend(cells)
+
+                    # Post-process the cells
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+
+                # DEBUG code:
+                if settings.debug.visualize_ocr:
+                    self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
+
+                yield page
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
@@ -1,4 +1,5 @@
 import logging
+import sys
 from pathlib import Path
 from typing import Optional
 
@@ -10,6 +11,7 @@
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
@@ -18,6 +20,7 @@
 from docling.models.ds_glm_model import GlmModel, GlmOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
+from docling.models.ocr_mac_model import OcrMacModel
 from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
 from docling.models.page_preprocessing_model import (
     PagePreprocessingModel,
@@ -118,6 +121,15 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
                 enabled=self.pipeline_options.do_ocr,
                 options=self.pipeline_options.ocr_options,
             )
+        elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
+            if "darwin" != sys.platform:
+                raise RuntimeError(
+                    f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
+                )
+            return OcrMacModel(
+                enabled=self.pipeline_options.do_ocr,
+                options=self.pipeline_options.ocr_options,
+            )
         return None
 
     def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:

diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py
@@ -7,6 +7,7 @@
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.models.ocr_mac_model import OcrMacOptions
 from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
 from docling.models.tesseract_ocr_model import TesseractOcrOptions
 
@@ -122,6 +123,20 @@ def main():
     #     }
     # )
 
+    # Docling Parse with ocrmac(Mac only)
+    # ----------------------
+    # pipeline_options = PdfPipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = OcrMacOptions()
+
+    # doc_converter = DocumentConverter(
+    #     format_options={
+    #         InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+    #     }
+    # )
+
     ###########################################################################
 
     start_time = time.time()

diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py
@@ -4,6 +4,7 @@
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
+    OcrMacOptions,
     PdfPipelineOptions,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
@@ -19,9 +20,10 @@ def main():
     pipeline_options.do_table_structure = True
     pipeline_options.table_structure_options.do_cell_matching = True
 
-    # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions
+    # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only)
     # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
     # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
+    # ocr_options = OcrMacOptions(force_full_page_ocr=True)
     ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
     pipeline_options.ocr_options = ocr_options
 

diff --git a/docs/installation.md b/docs/installation.md
@@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
     | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
     | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
     | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+    | [OcrMac](https://github.com/straussmaximilian/ocrmac) | macOS only. Install via `pip install ocrmac`. See description below. | `OcrMacOptions` |
 
     The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
 
@@ -91,6 +92,17 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
     pip install --no-binary :all: tesserocr
     ```
 
+    <h3>ocrmac installation</h3>
+
+    [ocrmac](https://github.com/straussmaximilian/ocrmac) uses
+    Apple's Vision (or LiveText) framework as its OCR backend.
+    To use this engine with Docling, ocrmac must be installed on your system.
+    It only works on macOS 10.15 or newer.
+
+    ```console
+    pip install ocrmac
+    ```
+
 ## Development setup
 
 To develop Docling features, bugfixes etc., install as follows from your local clone's root dir: