From 6efa96c983fc509b2c7b35a4a25a714284f2f782 Mon Sep 17 00:00:00 2001 From: nuridol Date: Wed, 20 Nov 2024 20:51:19 +0900 Subject: [PATCH] feat: add support for `ocrmac` OCR engine on macOS (#276) * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo * updated the poetry lock Signed-off-by: Suhwan Seo * Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo * docs: update examples and installation for ocrmac support - Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples. - Included usage comments and examples for `OcrMacOptions` in OCR pipelines. - Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+). - Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend. This enhances documentation for users working on macOS to leverage `ocrmac` effectively. Signed-off-by: Suhwan Seo * fix: update `ocrmac` dependency with macOS-specific marker - Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility. 
- Updated the content hash in `poetry.lock` to reflect the changes. This ensures the `ocrmac` dependency is only installed on macOS systems. Signed-off-by: Suhwan Seo --------- Signed-off-by: Suhwan Seo Co-authored-by: Suhwan Seo --- docling/cli/main.py | 4 + docling/datamodel/pipeline_options.py | 17 ++- docling/models/ocr_mac_model.py | 118 +++++++++++++++++++ docling/pipeline/standard_pdf_pipeline.py | 12 ++ docs/examples/custom_convert.py | 15 +++ docs/examples/full_page_ocr.py | 4 +- docs/installation.md | 12 ++ poetry.lock | 133 ++++++++++++++++++++-- pyproject.toml | 3 + tests/test_e2e_ocr_conversion.py | 7 ++ 10 files changed, 311 insertions(+), 14 deletions(-) create mode 100644 docling/models/ocr_mac_model.py diff --git a/docling/cli/main.py b/docling/cli/main.py index a2a86bf4..8e0d23c1 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -24,6 +24,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, OcrOptions, PdfPipelineOptions, TableFormerMode, @@ -74,6 +75,7 @@ class OcrEngine(str, Enum): EASYOCR = "easyocr" TESSERACT_CLI = "tesseract_cli" TESSERACT = "tesseract" + OCRMAC = "ocrmac" def export_documents( @@ -259,6 +261,8 @@ def convert( ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr) case OcrEngine.TESSERACT: ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr) + case OcrEngine.OCRMAC: + ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr) case _: raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 1ea4d62a..b691215c 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions): ) +class OcrMacOptions(OcrOptions): + kind: Literal["ocrmac"] = "ocrmac" + lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"] + recognition: 
str = "accurate" + framework: str = "vision" + + model_config = ConfigDict( + extra="forbid", + ) + + class PipelineOptions(BaseModel): create_legacy_output: bool = ( True # This defautl will be set to False on a future version of docling @@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = ( - Field(EasyOcrOptions(), discriminator="kind") - ) + ocr_options: Union[ + EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions + ] = Field(EasyOcrOptions(), discriminator="kind") images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py new file mode 100644 index 00000000..38bcf1ca --- /dev/null +++ b/docling/models/ocr_mac_model.py @@ -0,0 +1,118 @@ +import logging +import tempfile +from typing import Iterable, Optional, Tuple + +from docling_core.types.doc import BoundingBox, CoordOrigin + +from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import OcrMacOptions +from docling.datamodel.settings import settings +from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder + +_log = logging.getLogger(__name__) + + +class OcrMacModel(BaseOcrModel): + def __init__(self, enabled: bool, options: OcrMacOptions): + super().__init__(enabled=enabled, options=options) + self.options: OcrMacOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + + if self.enabled: + install_errmsg = ( + "ocrmac is not correctly installed. " + "Please install it via `pip install ocrmac` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. 
See the documentation: " + "https://ds4sd.github.io/docling/installation/" + ) + try: + from ocrmac import ocrmac + except ImportError: + raise ImportError(install_errmsg) + + self.reader_RIL = ocrmac.OCR + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "ocr"): + + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + with tempfile.NamedTemporaryFile( + suffix=".png", mode="w" + ) as image_file: + fname = image_file.name + high_res_image.save(fname) + + boxes = self.reader_RIL( + fname, + recognition_level=self.options.recognition, + framework=self.options.framework, + language_preference=self.options.lang, + ).recognize() + + im_width, im_height = high_res_image.size + cells = [] + for ix, (text, confidence, box) in enumerate(boxes): + x = float(box[0]) + y = float(box[1]) + w = float(box[2]) + h = float(box[3]) + + x1 = x * im_width + y2 = (1 - y) * im_height + + x2 = x1 + w * im_width + y1 = y2 - h * im_height + + left = x1 / self.scale + top = y1 / self.scale + right = x2 / self.scale + bottom = y2 / self.scale + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) + + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) + + # DEBUG code: + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) + + yield page diff --git 
a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 65803d4f..63a7a89f 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -1,4 +1,5 @@ import logging +import sys from pathlib import Path from typing import Optional @@ -10,6 +11,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, @@ -18,6 +20,7 @@ from docling.models.ds_glm_model import GlmModel, GlmOptions from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel +from docling.models.ocr_mac_model import OcrMacModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_preprocessing_model import ( PagePreprocessingModel, @@ -118,6 +121,15 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]: enabled=self.pipeline_options.do_ocr, options=self.pipeline_options.ocr_options, ) + elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions): + if "darwin" != sys.platform: + raise RuntimeError( + f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}." 
+ ) + return OcrMacModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) return None def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index 7631848b..2d300904 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -7,6 +7,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.models.ocr_mac_model import OcrMacOptions from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions from docling.models.tesseract_ocr_model import TesseractOcrOptions @@ -122,6 +123,20 @@ def main(): # } # ) + # Docling Parse with ocrmac(Mac only) + # ---------------------- + # pipeline_options = PdfPipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = OcrMacOptions() + + # doc_converter = DocumentConverter( + # format_options={ + # InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + # } + # ) + ########################################################################### start_time = time.time() diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index 35c2ba6b..bbb7e122 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -4,6 +4,7 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, PdfPipelineOptions, TesseractCliOcrOptions, TesseractOcrOptions, @@ -19,9 +20,10 @@ def main(): pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - # Any of the OCR options can be used:EasyOcrOptions, 
TesseractOcrOptions, TesseractCliOcrOptions + # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only) # ocr_options = EasyOcrOptions(force_full_page_ocr=True) # ocr_options = TesseractOcrOptions(force_full_page_ocr=True) + # ocr_options = OcrMacOptions(force_full_page_ocr=True) ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) pipeline_options.ocr_options = ocr_options diff --git a/docs/installation.md b/docs/installation.md index df18dece..addae382 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` | | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` | | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` | + | OcrMac | System dependency. See description below. | `OcrMacOptions` | The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example @@ -91,6 +92,17 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi pip install --no-binary :all: tesserocr ``` +

<details>
+ <summary><b>ocrmac installation</b></summary>

+ + [ocrmac](https://github.com/straussmaximilian/ocrmac) is using + Apple's vision(or livetext) framework as OCR backend. + For using this engine with Docling, ocrmac must be installed on your system. + This only works on macOS systems with newer macOS versions (10.15+). + + ```console + pip install ocrmac + ``` + ## Development setup To develop Docling features, bugfixes etc., install as follows from your local clone's root dir: diff --git a/poetry.lock b/poetry.lock index f0717c43..d8dd1c80 100644 --- a/poetry.lock +++ b/poetry.lock @@ -182,8 +182,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -825,8 +825,8 @@ files = [ docling-core = ">=2.0,<3.0" docutils = "!=0.21" numpy = [ - {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""}, + {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, ] pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""} python-dotenv = ">=1.0.0,<2.0.0" @@ -912,8 +912,8 @@ huggingface_hub = ">=0.23,<1" jsonlines = ">=3.1.0,<4.0.0" mean_average_precision = ">=2021.4.26.0,<2022.0.0.0" numpy = [ - {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""}, + {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, ] opencv-python-headless = ">=4.6.0.66,<5.0.0.0" Pillow = ">=10.0.0,<11.0.0" @@ -2063,8 +2063,8 @@ jsonpatch = ">=1.33,<2.0" langsmith = ">=0.1.112,<0.2.0" packaging = ">=23.2,<25" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", 
markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" @@ -2132,8 +2132,8 @@ files = [ httpx = ">=0.23.0,<1" orjson = ">=3.9.14,<4.0.0" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] requests = ">=2,<3" requests-toolbelt = ">=1.0.0,<2.0.0" @@ -3548,6 +3548,22 @@ files = [ {file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"}, ] +[[package]] +name = "ocrmac" +version = "1.0.0" +description = "A python wrapper to extract text from images on a mac system. Uses the vision framework from Apple." +optional = true +python-versions = ">=3.6" +files = [ + {file = "ocrmac-1.0.0-py2.py3-none-any.whl", hash = "sha256:0b5a072aa23a9ead48132cb2d595b680aa6c3c5a6cb69525155e35ca95610c3a"}, + {file = "ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e"}, +] + +[package.dependencies] +Click = ">=7.0" +pillow = "*" +pyobjc-framework-Vision = "*" + [[package]] name = "opencv-python-headless" version = "4.10.0.84" @@ -3566,10 +3582,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3732,9 +3748,9 @@ files = [ [package.dependencies] numpy = 
[ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4331,8 +4347,8 @@ files = [ annotated-types = ">=0.6.0" pydantic-core = "2.23.4" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4500,8 +4516,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -4556,6 +4572,102 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"] model = ["milvus-model (>=0.1.0)"] +[[package]] +name = "pyobjc-core" +version = "10.3.1" +description = "Python<->ObjC Interoperability Module" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyobjc_core-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ea46d2cda17921e417085ac6286d43ae448113158afcf39e0abe484c58fb3d78"}, + {file = "pyobjc_core-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:899d3c84d2933d292c808f385dc881a140cf08632907845043a333a9d7c899f9"}, + {file = "pyobjc_core-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6ff5823d13d0a534cdc17fa4ad47cf5bee4846ce0fd27fc40012e12b46db571b"}, + {file = 
"pyobjc_core-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2581e8e68885bcb0e11ec619e81ef28e08ee3fac4de20d8cc83bc5af5bcf4a90"}, + {file = "pyobjc_core-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea98d4c2ec39ca29e62e0327db21418696161fb138ee6278daf2acbedf7ce504"}, + {file = "pyobjc_core-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:4c179c26ee2123d0aabffb9dbc60324b62b6f8614fb2c2328b09386ef59ef6d8"}, + {file = "pyobjc_core-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cb901fce65c9be420c40d8a6ee6fff5ff27c6945f44fd7191989b982baa66dea"}, + {file = "pyobjc_core-10.3.1.tar.gz", hash = "sha256:b204a80ccc070f9ab3f8af423a3a25a6fd787e228508d00c4c30f8ac538ba720"}, +] + +[[package]] +name = "pyobjc-framework-cocoa" +version = "10.3.1" +description = "Wrappers for the Cocoa frameworks on macOS" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyobjc_framework_Cocoa-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4cb4f8491ab4d9b59f5187e42383f819f7a46306a4fa25b84f126776305291d1"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5f31021f4f8fdf873b57a97ee1f3c1620dbe285e0b4eaed73dd0005eb72fd773"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:11b4e0bad4bbb44a4edda128612f03cdeab38644bbf174de0c13129715497296"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:de5e62e5ccf2871a94acf3bf79646b20ea893cc9db78afa8d1fe1b0d0f7cbdb0"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c5af24610ab639bd1f521ce4500484b40787f898f691b7a23da3339e6bc8b90"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:a7151186bb7805deea434fae9a4423335e6371d105f29e73cc2036c6779a9dbc"}, + {file = "pyobjc_framework_Cocoa-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = 
"sha256:743d2a1ac08027fd09eab65814c79002a1d0421d7c0074ffd1217b6560889744"}, + {file = "pyobjc_framework_cocoa-10.3.1.tar.gz", hash = "sha256:1cf20714daaa986b488fb62d69713049f635c9d41a60c8da97d835710445281a"}, +] + +[package.dependencies] +pyobjc-core = ">=10.3.1" + +[[package]] +name = "pyobjc-framework-coreml" +version = "10.3.1" +description = "Wrappers for the framework CoreML on macOS" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:c1fdcc0487807afa9cd0f88f25697e0e2e093d0219e8e1aa42aa3674dd78c2cb"}, + {file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:21c87e84c807b5dbe61e0f016d9aefa32d3212f175cc4b976b5c08770be7a58c"}, + {file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a0877aed5d4cdbb63d1246cd5384c09d78a0667e83c435a1257d10017c11c1a4"}, + {file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4bd3f1acfb3245727727b71cbcf7d21a33d7e00fa488e41ad01527764b969b92"}, + {file = "pyobjc_framework_coreml-10.3.1.tar.gz", hash = "sha256:6b7091142cfaafee76f1a804329e7a4e3aeca921eea8644e9ceba4cc2751f705"}, +] + +[package.dependencies] +pyobjc-core = ">=10.3.1" +pyobjc-framework-Cocoa = ">=10.3.1" + +[[package]] +name = "pyobjc-framework-quartz" +version = "10.3.1" +description = "Wrappers for the Quartz frameworks on macOS" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyobjc_framework_Quartz-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ef4fd315ed2bc42ef77fdeb2bae28a88ec986bd7b8079a87ba3b3475348f96e"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:96578d4a3e70164efe44ad7dc320ecd4e211758ffcde5dcd694de1bbdfe090a4"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = 
"sha256:ca35f92486869a41847a1703bb176aab8a53dbfd8e678d1f4d68d8e6e1581c71"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:00a0933267e3a46ea4afcc35d117b2efb920f06de797fa66279c52e7057e3590"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a161bedb4c5257a02ad56a910cd7eefb28bdb0ea78607df0d70ed4efe4ea54c1"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:d7a8028e117a94923a511944bfa9daf9744e212f06cf89010c60934a479863a5"}, + {file = "pyobjc_framework_Quartz-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:de00c983b3267eb26fa42c6ed9f15e2bf006bde8afa7fe2b390646aa21a5d6fc"}, + {file = "pyobjc_framework_quartz-10.3.1.tar.gz", hash = "sha256:b6d7e346d735c9a7f147cd78e6da79eeae416a0b7d3874644c83a23786c6f886"}, +] + +[package.dependencies] +pyobjc-core = ">=10.3.1" +pyobjc-framework-Cocoa = ">=10.3.1" + +[[package]] +name = "pyobjc-framework-vision" +version = "10.3.1" +description = "Wrappers for the framework Vision on macOS" +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:dff3582678930461a0bb11bf070854d49f6944a851dc89edc63fac93c75ddf39"}, + {file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:32626183c51674efb3b5738e2884c3fea37edca010117cf71bd72cb3c49c869a"}, + {file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2473b346a112c51ac485184305bd13c402e0db45f2df3d277315bd49efba18e9"}, + {file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4302e2c5f68c9667ecd4273809cbc4611af6368b123d69596e5b088f1b1aa16b"}, + {file = "pyobjc_framework_vision-10.3.1.tar.gz", hash = "sha256:aa071656d395afc2d624600a9f30d6a3344aa747bf37f613ff3972158c40881c"}, +] + +[package.dependencies] +pyobjc-core = ">=10.3.1" 
+pyobjc-framework-Cocoa = ">=10.3.1" +pyobjc-framework-CoreML = ">=10.3.1" +pyobjc-framework-Quartz = ">=10.3.1" + [[package]] name = "pypdfium2" version = "4.30.0" @@ -7248,9 +7360,10 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] +ocrmac = ["ocrmac"] tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "a0f599090cfd9414c0e90fd611fd0b23166a45cd925904491eb0503a6f6bd1d8" +content-hash = "129137f8229158ac7672919df1684a260f74db22517d4d40c905f801f2950f46" diff --git a/pyproject.toml b/pyproject.toml index abca5314..52c68035 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3" pandas = "^2.1.4" marko = "^2.1.2" openpyxl = "^3.1.5" +ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true } [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} @@ -95,6 +96,7 @@ torchvision = [ [tool.poetry.extras] tesserocr = ["tesserocr"] +ocrmac = ["ocrmac"] [tool.poetry.scripts] docling = "docling.cli.main:app" @@ -130,6 +132,7 @@ module = [ "tesserocr.*", "docling_ibm_models.*", "easyocr.*", + "ocrmac.*", "deepsearch_glm.*", "lxml.*", "bs4.*", diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 324a4a14..68dac33e 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -1,3 +1,4 @@ +import sys from pathlib import Path from typing import List @@ -6,6 +7,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( EasyOcrOptions, + OcrMacOptions, OcrOptions, PdfPipelineOptions, TesseractCliOcrOptions, @@ -59,6 +61,11 @@ def test_e2e_conversions(): TesseractCliOcrOptions(force_full_page_ocr=True), ] + # only works on mac + if "darwin" == sys.platform: + engines.append(OcrMacOptions()) + engines.append(OcrMacOptions(force_full_page_ocr=True)) + for ocr_options 
in engines: print(f"Converting with ocr_engine: {ocr_options.kind}") converter = get_converter(ocr_options=ocr_options)