-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add support for
ocrmac
OCR engine on macOS (#276)
* feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <[email protected]> * updated the poetry lock Signed-off-by: Suhwan Seo <[email protected]> * Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo <[email protected]> * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <[email protected]> * docs: update examples and installation for ocrmac support - Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples. - Included usage comments and examples for `OcrMacOptions` in OCR pipelines. - Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+). - Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend. This enhances documentation for users working on macOS to leverage `ocrmac` effectively. Signed-off-by: Suhwan Seo <[email protected]> * fix: update `ocrmac` dependency with macOS-specific marker - Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility. - Updated the content hash in `poetry.lock` to reflect the changes. This ensures the `ocrmac` dependency is only installed on macOS systems. Signed-off-by: Suhwan Seo <[email protected]> --------- Signed-off-by: Suhwan Seo <[email protected]> Co-authored-by: Suhwan Seo <[email protected]>
- Loading branch information
Showing
10 changed files
with
311 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
import logging | ||
import tempfile | ||
from typing import Iterable, Optional, Tuple | ||
|
||
from docling_core.types.doc import BoundingBox, CoordOrigin | ||
|
||
from docling.datamodel.base_models import OcrCell, Page | ||
from docling.datamodel.document import ConversionResult | ||
from docling.datamodel.pipeline_options import OcrMacOptions | ||
from docling.datamodel.settings import settings | ||
from docling.models.base_ocr_model import BaseOcrModel | ||
from docling.utils.profiling import TimeRecorder | ||
|
||
_log = logging.getLogger(__name__) | ||
|
||
|
||
class OcrMacModel(BaseOcrModel): | ||
def __init__(self, enabled: bool, options: OcrMacOptions): | ||
super().__init__(enabled=enabled, options=options) | ||
self.options: OcrMacOptions | ||
|
||
self.scale = 3 # multiplier for 72 dpi == 216 dpi. | ||
|
||
if self.enabled: | ||
install_errmsg = ( | ||
"ocrmac is not correctly installed. " | ||
"Please install it via `pip install ocrmac` to use this OCR engine. " | ||
"Alternatively, Docling has support for other OCR engines. See the documentation: " | ||
"https://ds4sd.github.io/docling/installation/" | ||
) | ||
try: | ||
from ocrmac import ocrmac | ||
except ImportError: | ||
raise ImportError(install_errmsg) | ||
|
||
self.reader_RIL = ocrmac.OCR | ||
|
||
def __call__( | ||
self, conv_res: ConversionResult, page_batch: Iterable[Page] | ||
) -> Iterable[Page]: | ||
|
||
if not self.enabled: | ||
yield from page_batch | ||
return | ||
|
||
for page in page_batch: | ||
assert page._backend is not None | ||
if not page._backend.is_valid(): | ||
yield page | ||
else: | ||
with TimeRecorder(conv_res, "ocr"): | ||
|
||
ocr_rects = self.get_ocr_rects(page) | ||
|
||
all_ocr_cells = [] | ||
for ocr_rect in ocr_rects: | ||
# Skip zero area boxes | ||
if ocr_rect.area() == 0: | ||
continue | ||
high_res_image = page._backend.get_page_image( | ||
scale=self.scale, cropbox=ocr_rect | ||
) | ||
|
||
with tempfile.NamedTemporaryFile( | ||
suffix=".png", mode="w" | ||
) as image_file: | ||
fname = image_file.name | ||
high_res_image.save(fname) | ||
|
||
boxes = self.reader_RIL( | ||
fname, | ||
recognition_level=self.options.recognition, | ||
framework=self.options.framework, | ||
language_preference=self.options.lang, | ||
).recognize() | ||
|
||
im_width, im_height = high_res_image.size | ||
cells = [] | ||
for ix, (text, confidence, box) in enumerate(boxes): | ||
x = float(box[0]) | ||
y = float(box[1]) | ||
w = float(box[2]) | ||
h = float(box[3]) | ||
|
||
x1 = x * im_width | ||
y2 = (1 - y) * im_height | ||
|
||
x2 = x1 + w * im_width | ||
y1 = y2 - h * im_height | ||
|
||
left = x1 / self.scale | ||
top = y1 / self.scale | ||
right = x2 / self.scale | ||
bottom = y2 / self.scale | ||
|
||
cells.append( | ||
OcrCell( | ||
id=ix, | ||
text=text, | ||
confidence=confidence, | ||
bbox=BoundingBox.from_tuple( | ||
coord=(left, top, right, bottom), | ||
origin=CoordOrigin.TOPLEFT, | ||
), | ||
) | ||
) | ||
|
||
# del high_res_image | ||
all_ocr_cells.extend(cells) | ||
|
||
# Post-process the cells | ||
page.cells = self.post_process_cells(all_ocr_cells, page.cells) | ||
|
||
# DEBUG code: | ||
if settings.debug.visualize_ocr: | ||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) | ||
|
||
yield page |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.