From 85b29990be6468516b6dbe49f880d9f1f4c11c5a Mon Sep 17 00:00:00 2001 From: Swaymaw <87603098+Swaymaw@users.noreply.github.com> Date: Wed, 27 Nov 2024 18:27:41 +0530 Subject: [PATCH] feat(ocr): added support for RapidOCR engine (#415) * adding rapidocr engine for ocr in docling Signed-off-by: swayam-singhal * fixing styling format Signed-off-by: Swaymaw * updating pyproject.toml and poetry.lock to fix ci bugs Signed-off-by: Swaymaw * help poetry pinning for python3.9 Signed-off-by: Michele Dolfi * simplifying rapidocr options so that device can be changed using a single option for all models Signed-off-by: Swaymaw * fix styling issues and small bug in rapidOcrOptions Signed-off-by: Swaymaw * use default device until we enable global management Signed-off-by: Michele Dolfi --------- Signed-off-by: swayam-singhal Signed-off-by: Swaymaw Signed-off-by: Michele Dolfi Co-authored-by: swayam-singhal Co-authored-by: Michele Dolfi --- docling/cli/main.py | 4 + docling/datamodel/pipeline_options.py | 36 ++++ docling/models/rapid_ocr_model.py | 147 +++++++++++++++ docling/pipeline/standard_pdf_pipeline.py | 7 + docs/examples/full_page_ocr.py | 4 +- docs/installation.md | 1 + poetry.lock | 208 ++++++++++++++++++++-- pyproject.toml | 8 + tests/test_e2e_ocr_conversion.py | 3 + 9 files changed, 405 insertions(+), 13 deletions(-) create mode 100644 docling/models/rapid_ocr_model.py diff --git a/docling/cli/main.py b/docling/cli/main.py index 09102abd..d6f51b74 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -27,6 +27,7 @@ OcrMacOptions, OcrOptions, PdfPipelineOptions, + RapidOcrOptions, TableFormerMode, TesseractCliOcrOptions, TesseractOcrOptions, @@ -76,6 +77,7 @@ class OcrEngine(str, Enum): TESSERACT_CLI = "tesseract_cli" TESSERACT = "tesseract" OCRMAC = "ocrmac" + RAPIDOCR = "rapidocr" def export_documents( @@ -262,6 +264,8 @@ def convert( ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr) elif ocr_engine == OcrEngine.OCRMAC: ocr_options = 
OcrMacOptions(force_full_page_ocr=force_ocr) + elif ocr_engine == OcrEngine.RAPIDOCR: + ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr) else: raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}") diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index b691215c..58a7bb97 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -29,6 +29,42 @@ class OcrOptions(BaseModel): ) +class RapidOcrOptions(OcrOptions): + kind: Literal["rapidocr"] = "rapidocr" + + # English and Chinese are the most commonly used models and have been tested with RapidOCR. + lang: List[str] = [ + "english", + "chinese", + ] # However, language as a parameter is not supported by rapidocr yet, so changing this option has no effect. + # For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/ + + # For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/ + text_score: float = 0.5 # same default as rapidocr + + use_det: Optional[bool] = None # same default as rapidocr + use_cls: Optional[bool] = None # same default as rapidocr + use_rec: Optional[bool] = None # same default as rapidocr + + # class Device(Enum): + # CPU = "CPU" + # CUDA = "CUDA" + # DIRECTML = "DIRECTML" + # AUTO = "AUTO" + + # device: Device = Device.AUTO # Default value is AUTO + + print_verbose: bool = False # same default as rapidocr + + det_model_path: Optional[str] = None # same default as rapidocr + cls_model_path: Optional[str] = None # same default as rapidocr + rec_model_path: Optional[str] = None # same default as rapidocr + + model_config = ConfigDict( + extra="forbid", + ) + + class EasyOcrOptions(OcrOptions): kind: Literal["easyocr"] = "easyocr" lang: List[str] = ["fr", "de", "es", "en"] diff --git a/docling/models/rapid_ocr_model.py 
b/docling/models/rapid_ocr_model.py new file mode 100644 index 00000000..7fd5a3d4 --- /dev/null +++ b/docling/models/rapid_ocr_model.py @@ -0,0 +1,147 @@ +import logging +from typing import Iterable + +import numpy +from docling_core.types.doc import BoundingBox, CoordOrigin + +from docling.datamodel.base_models import OcrCell, Page +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import RapidOcrOptions +from docling.datamodel.settings import settings +from docling.models.base_ocr_model import BaseOcrModel +from docling.utils.profiling import TimeRecorder + +_log = logging.getLogger(__name__) + + +class RapidOcrModel(BaseOcrModel): + def __init__(self, enabled: bool, options: RapidOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: RapidOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + + if self.enabled: + try: + from rapidocr_onnxruntime import RapidOCR # type: ignore + except ImportError: + raise ImportError( + "RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) + + # This configuration option will be revamped while introducing device settings for all models. + # For the moment we will default to auto and let onnx-runtime pick the best. 
+ cls_use_cuda = True + rec_use_cuda = True + det_use_cuda = True + det_use_dml = True + cls_use_dml = True + rec_use_dml = True + + # # Same as Defaults in RapidOCR + # cls_use_cuda = False + # rec_use_cuda = False + # det_use_cuda = False + # det_use_dml = False + # cls_use_dml = False + # rec_use_dml = False + + # # If we set everything to true onnx-runtime would automatically choose the fastest accelerator + # if self.options.device == self.options.Device.AUTO: + # cls_use_cuda = True + # rec_use_cuda = True + # det_use_cuda = True + # det_use_dml = True + # cls_use_dml = True + # rec_use_dml = True + + # # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU. + # elif self.options.device == self.options.Device.CUDA: + # cls_use_cuda = True + # rec_use_cuda = True + # det_use_cuda = True + + # # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU. 
+ # elif self.options.device == self.options.Device.DIRECTML: + # det_use_dml = True + # cls_use_dml = True + # rec_use_dml = True + + self.reader = RapidOCR( + text_score=self.options.text_score, + cls_use_cuda=cls_use_cuda, + rec_use_cuda=rec_use_cuda, + det_use_cuda=det_use_cuda, + det_use_dml=det_use_dml, + cls_use_dml=cls_use_dml, + rec_use_dml=rec_use_dml, + print_verbose=self.options.print_verbose, + det_model_path=self.options.det_model_path, + cls_model_path=self.options.cls_model_path, + rec_model_path=self.options.rec_model_path, + ) + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "ocr"): + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + im = numpy.array(high_res_image) + result, _ = self.reader( + im, + use_det=self.options.use_det, + use_cls=self.options.use_cls, + use_rec=self.options.use_rec, + ) + + del high_res_image + del im + + cells = [ + OcrCell( + id=ix, + text=line[1], + confidence=line[2], + bbox=BoundingBox.from_tuple( + coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) + + # Post-process the cells + page.cells = self.post_process_cells(all_ocr_cells, page.cells) + + # DEBUG code: + if settings.debug.visualize_ocr: + self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects) + + yield page diff --git a/docling/pipeline/standard_pdf_pipeline.py 
b/docling/pipeline/standard_pdf_pipeline.py index 63a7a89f..40105a38 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -13,6 +13,7 @@ EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, + RapidOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -26,6 +27,7 @@ PagePreprocessingModel, PagePreprocessingOptions, ) +from docling.models.rapid_ocr_model import RapidOcrModel from docling.models.table_structure_model import TableStructureModel from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel from docling.models.tesseract_ocr_model import TesseractOcrModel @@ -121,6 +123,11 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]: enabled=self.pipeline_options.do_ocr, options=self.pipeline_options.ocr_options, ) + elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions): + return RapidOcrModel( + enabled=self.pipeline_options.do_ocr, + options=self.pipeline_options.ocr_options, + ) elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions): if "darwin" != sys.platform: raise RuntimeError( diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index bbb7e122..967910dc 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -6,6 +6,7 @@ EasyOcrOptions, OcrMacOptions, PdfPipelineOptions, + RapidOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -20,10 +21,11 @@ def main(): pipeline_options.do_table_structure = True pipeline_options.table_structure_options.do_cell_matching = True - # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only) + # Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions # ocr_options = EasyOcrOptions(force_full_page_ocr=True) # ocr_options = TesseractOcrOptions(force_full_page_ocr=True) # ocr_options = OcrMacOptions(force_full_page_ocr=True) + # 
ocr_options = RapidOcrOptions(force_full_page_ocr=True) ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True) pipeline_options.ocr_options = ocr_options diff --git a/docs/installation.md b/docs/installation.md index addae382..fec7c632 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -31,6 +31,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` | | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` | | OcrMac | System dependency. See description below. | `OcrMacOptions` | + | [RapidOCR](https://github.com/RapidAI/RapidOCR) | Optional extra, not included in the default Docling installation. Install it via `pip install rapidocr_onnxruntime`. | `RapidOcrOptions` | The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example diff --git a/poetry.lock b/poetry.lock index 32454b75..7ee6070a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -182,8 +182,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -618,6 +618,23 @@ files = [ {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +description = "Colored terminal output for Python's logging module" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934"}, + {file = "coloredlogs-15.0.1.tar.gz", hash 
= "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0"}, +] + +[package.dependencies] +humanfriendly = ">=9.1" + +[package.extras] +cron = ["capturer (>=2.4)"] + [[package]] name = "comm" version = "0.2.2" @@ -825,8 +842,8 @@ files = [ docling-core = ">=2.0,<3.0" docutils = "!=0.21" numpy = [ - {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""}, + {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, ] pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""} python-dotenv = ">=1.0.0,<2.0.0" @@ -912,8 +929,8 @@ files = [ huggingface_hub = ">=0.23,<1" jsonlines = ">=3.1.0,<4.0.0" numpy = [ - {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""}, + {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, ] opencv-python-headless = ">=4.6.0.66,<5.0.0.0" Pillow = ">=10.0.0,<11.0.0" @@ -1145,6 +1162,17 @@ TOMLi = {version = "*", markers = "python_version < \"3.11\""} [package.extras] dev = ["pyTest", "pyTest-cov"] +[[package]] +name = "flatbuffers" +version = "24.3.25" +description = "The FlatBuffers serialization format for Python" +optional = true +python-versions = "*" +files = [ + {file = "flatbuffers-24.3.25-py2.py3-none-any.whl", hash = "sha256:8dbdec58f935f3765e4f7f3cf635ac3a77f83568138d6a2311f524ec96364812"}, + {file = "flatbuffers-24.3.25.tar.gz", hash = "sha256:de2ec5b203f21441716617f38443e0a8ebf3d25bf0d9c0bb0ce68fa00ad546a4"}, +] + [[package]] name = "frozenlist" version = "1.5.0" @@ -1495,6 +1523,20 @@ testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gr torch = ["safetensors[torch]", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] +[[package]] +name = 
"humanfriendly" +version = "10.0" +description = "Human friendly output for text interfaces using Python" +optional = true +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477"}, + {file = "humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc"}, +] + +[package.dependencies] +pyreadline3 = {version = "*", markers = "sys_platform == \"win32\" and python_version >= \"3.8\""} + [[package]] name = "identify" version = "2.6.3" @@ -2050,8 +2092,8 @@ jsonpatch = ">=1.33,<2.0" langsmith = ">=0.1.112,<0.2.0" packaging = ">=23.2,<25" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" @@ -2119,8 +2161,8 @@ files = [ httpx = ">=0.23.0,<1" orjson = {version = ">=3.9.14,<4.0.0", markers = "platform_python_implementation != \"PyPy\""} pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] requests = ">=2,<3" requests-toolbelt = ">=1.0.0,<2.0.0" @@ -3524,6 +3566,112 @@ Click = ">=7.0" pillow = "*" pyobjc-framework-Vision = "*" +[[package]] +name = "onnxruntime" +version = "1.19.2" +description = "ONNX Runtime is a runtime accelerator for Machine Learning models" +optional = true +python-versions = "*" +files = [ + {file = "onnxruntime-1.19.2-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:84fa57369c06cadd3c2a538ae2a26d76d583e7c34bdecd5769d71ca5c0fc750e"}, + {file = 
"onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdc471a66df0c1cdef774accef69e9f2ca168c851ab5e4f2f3341512c7ef4666"}, + {file = "onnxruntime-1.19.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e3a4ce906105d99ebbe817f536d50a91ed8a4d1592553f49b3c23c4be2560ae6"}, + {file = "onnxruntime-1.19.2-cp310-cp310-win32.whl", hash = "sha256:4b3d723cc154c8ddeb9f6d0a8c0d6243774c6b5930847cc83170bfe4678fafb3"}, + {file = "onnxruntime-1.19.2-cp310-cp310-win_amd64.whl", hash = "sha256:17ed7382d2c58d4b7354fb2b301ff30b9bf308a1c7eac9546449cd122d21cae5"}, + {file = "onnxruntime-1.19.2-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:d863e8acdc7232d705d49e41087e10b274c42f09e259016a46f32c34e06dc4fd"}, + {file = "onnxruntime-1.19.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c1dfe4f660a71b31caa81fc298a25f9612815215a47b286236e61d540350d7b6"}, + {file = "onnxruntime-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a36511dc07c5c964b916697e42e366fa43c48cdb3d3503578d78cef30417cb84"}, + {file = "onnxruntime-1.19.2-cp311-cp311-win32.whl", hash = "sha256:50cbb8dc69d6befad4746a69760e5b00cc3ff0a59c6c3fb27f8afa20e2cab7e7"}, + {file = "onnxruntime-1.19.2-cp311-cp311-win_amd64.whl", hash = "sha256:1c3e5d415b78337fa0b1b75291e9ea9fb2a4c1f148eb5811e7212fed02cfffa8"}, + {file = "onnxruntime-1.19.2-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:68e7051bef9cfefcbb858d2d2646536829894d72a4130c24019219442b1dd2ed"}, + {file = "onnxruntime-1.19.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d2d366fbcc205ce68a8a3bde2185fd15c604d9645888703785b61ef174265168"}, + {file = "onnxruntime-1.19.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:477b93df4db467e9cbf34051662a4b27c18e131fa1836e05974eae0d6e4cf29b"}, + {file = "onnxruntime-1.19.2-cp312-cp312-win32.whl", hash = 
"sha256:9a174073dc5608fad05f7cf7f320b52e8035e73d80b0a23c80f840e5a97c0147"}, + {file = "onnxruntime-1.19.2-cp312-cp312-win_amd64.whl", hash = "sha256:190103273ea4507638ffc31d66a980594b237874b65379e273125150eb044857"}, + {file = "onnxruntime-1.19.2-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:636bc1d4cc051d40bc52e1f9da87fbb9c57d9d47164695dfb1c41646ea51ea66"}, + {file = "onnxruntime-1.19.2-cp38-cp38-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5bd8b875757ea941cbcfe01582970cc299893d1b65bd56731e326a8333f638a3"}, + {file = "onnxruntime-1.19.2-cp38-cp38-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b2046fc9560f97947bbc1acbe4c6d48585ef0f12742744307d3364b131ac5778"}, + {file = "onnxruntime-1.19.2-cp38-cp38-win32.whl", hash = "sha256:31c12840b1cde4ac1f7d27d540c44e13e34f2345cf3642762d2a3333621abb6a"}, + {file = "onnxruntime-1.19.2-cp38-cp38-win_amd64.whl", hash = "sha256:016229660adea180e9a32ce218b95f8f84860a200f0f13b50070d7d90e92956c"}, + {file = "onnxruntime-1.19.2-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:006c8d326835c017a9e9f74c9c77ebb570a71174a1e89fe078b29a557d9c3848"}, + {file = "onnxruntime-1.19.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df2a94179a42d530b936f154615b54748239c2908ee44f0d722cb4df10670f68"}, + {file = "onnxruntime-1.19.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fae4b4de45894b9ce7ae418c5484cbf0341db6813effec01bb2216091c52f7fb"}, + {file = "onnxruntime-1.19.2-cp39-cp39-win32.whl", hash = "sha256:dc5430f473e8706fff837ae01323be9dcfddd3ea471c900a91fa7c9b807ec5d3"}, + {file = "onnxruntime-1.19.2-cp39-cp39-win_amd64.whl", hash = "sha256:38475e29a95c5f6c62c2c603d69fc7d4c6ccbf4df602bd567b86ae1138881c49"}, +] + +[package.dependencies] +coloredlogs = "*" +flatbuffers = "*" +numpy = ">=1.21.6" +packaging = "*" +protobuf = "*" +sympy = "*" + +[[package]] +name = "onnxruntime" +version = "1.20.1" +description = "ONNX Runtime is a runtime 
accelerator for Machine Learning models" +optional = true +python-versions = "*" +files = [ + {file = "onnxruntime-1.20.1-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:e50ba5ff7fed4f7d9253a6baf801ca2883cc08491f9d32d78a80da57256a5439"}, + {file = "onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b2908b50101a19e99c4d4e97ebb9905561daf61829403061c1adc1b588bc0de"}, + {file = "onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d82daaec24045a2e87598b8ac2b417b1cce623244e80e663882e9fe1aae86410"}, + {file = "onnxruntime-1.20.1-cp310-cp310-win32.whl", hash = "sha256:4c4b251a725a3b8cf2aab284f7d940c26094ecd9d442f07dd81ab5470e99b83f"}, + {file = "onnxruntime-1.20.1-cp310-cp310-win_amd64.whl", hash = "sha256:d3b616bb53a77a9463707bb313637223380fc327f5064c9a782e8ec69c22e6a2"}, + {file = "onnxruntime-1.20.1-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:06bfbf02ca9ab5f28946e0f912a562a5f005301d0c419283dc57b3ed7969bb7b"}, + {file = "onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6243e34d74423bdd1edf0ae9596dd61023b260f546ee17d701723915f06a9f7"}, + {file = "onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5eec64c0269dcdb8d9a9a53dc4d64f87b9e0c19801d9321246a53b7eb5a7d1bc"}, + {file = "onnxruntime-1.20.1-cp311-cp311-win32.whl", hash = "sha256:a19bc6e8c70e2485a1725b3d517a2319603acc14c1f1a017dda0afe6d4665b41"}, + {file = "onnxruntime-1.20.1-cp311-cp311-win_amd64.whl", hash = "sha256:8508887eb1c5f9537a4071768723ec7c30c28eb2518a00d0adcd32c89dea3221"}, + {file = "onnxruntime-1.20.1-cp312-cp312-macosx_13_0_universal2.whl", hash = "sha256:22b0655e2bf4f2161d52706e31f517a0e54939dc393e92577df51808a7edc8c9"}, + {file = "onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f1f56e898815963d6dc4ee1c35fc6c36506466eff6d16f3cb9848cea4e8c8172"}, + {file 
= "onnxruntime-1.20.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb71a814f66517a65628c9e4a2bb530a6edd2cd5d87ffa0af0f6f773a027d99e"}, + {file = "onnxruntime-1.20.1-cp312-cp312-win32.whl", hash = "sha256:bd386cc9ee5f686ee8a75ba74037750aca55183085bf1941da8efcfe12d5b120"}, + {file = "onnxruntime-1.20.1-cp312-cp312-win_amd64.whl", hash = "sha256:19c2d843eb074f385e8bbb753a40df780511061a63f9def1b216bf53860223fb"}, + {file = "onnxruntime-1.20.1-cp313-cp313-macosx_13_0_universal2.whl", hash = "sha256:cc01437a32d0042b606f462245c8bbae269e5442797f6213e36ce61d5abdd8cc"}, + {file = "onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb44b08e017a648924dbe91b82d89b0c105b1adcfe31e90d1dc06b8677ad37be"}, + {file = "onnxruntime-1.20.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bda6aebdf7917c1d811f21d41633df00c58aff2bef2f598f69289c1f1dabc4b3"}, + {file = "onnxruntime-1.20.1-cp313-cp313-win_amd64.whl", hash = "sha256:d30367df7e70f1d9fc5a6a68106f5961686d39b54d3221f760085524e8d38e16"}, + {file = "onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9158465745423b2b5d97ed25aa7740c7d38d2993ee2e5c3bfacb0c4145c49d8"}, + {file = "onnxruntime-1.20.1-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0df6f2df83d61f46e842dbcde610ede27218947c33e994545a22333491e72a3b"}, +] + +[package.dependencies] +coloredlogs = "*" +flatbuffers = "*" +numpy = ">=1.21.6" +packaging = "*" +protobuf = "*" +sympy = "*" + +[[package]] +name = "opencv-python" +version = "4.10.0.84" +description = "Wrapper package for OpenCV python bindings." 
+optional = true +python-versions = ">=3.6" +files = [ + {file = "opencv-python-4.10.0.84.tar.gz", hash = "sha256:72d234e4582e9658ffea8e9cae5b63d488ad06994ef12d81dc303b17472f3526"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl", hash = "sha256:fc182f8f4cda51b45f01c64e4cbedfc2f00aff799debebc305d8d0210c43f251"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-macosx_12_0_x86_64.whl", hash = "sha256:71e575744f1d23f79741450254660442785f45a0797212852ee5199ef12eed98"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09a332b50488e2dda866a6c5573ee192fe3583239fb26ff2f7f9ceb0bc119ea6"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ace140fc6d647fbe1c692bcb2abce768973491222c067c131d80957c595b71f"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-win32.whl", hash = "sha256:2db02bb7e50b703f0a2d50c50ced72e95c574e1e5a0bb35a8a86d0b35c98c236"}, + {file = "opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl", hash = "sha256:32dbbd94c26f611dc5cc6979e6b7aa1f55a64d6b463cc1dcd3c95505a63e48fe"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, + {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, 
+ {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, +] + [[package]] name = "opencv-python-headless" version = "4.10.0.84" @@ -3542,12 +3690,12 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, - {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, - {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, + {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, ] [[package]] @@ -3727,8 +3875,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, ] python-dateutil = ">=2.8.2" @@ -4326,8 +4474,8 @@ files = [ annotated-types = ">=0.6.0" pydantic-core = 
"2.23.4" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4495,8 +4643,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -4685,6 +4833,20 @@ files = [ flake8 = "6.1.0" tomli = {version = "*", markers = "python_version < \"3.11\""} +[[package]] +name = "pyreadline3" +version = "3.5.4" +description = "A python implementation of GNU readline." +optional = true +python-versions = ">=3.8" +files = [ + {file = "pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6"}, + {file = "pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7"}, +] + +[package.extras] +dev = ["build", "flake8", "mypy", "pytest", "twine"] + [[package]] name = "pytest" version = "7.4.4" @@ -5216,6 +5378,27 @@ files = [ [package.dependencies] cffi = {version = "*", markers = "implementation_name == \"pypy\""} +[[package]] +name = "rapidocr-onnxruntime" +version = "1.4.0" +description = "A cross platform OCR Library based on OnnxRuntime." 
+optional = true +python-versions = "<3.13,>=3.6" +files = [ + {file = "rapidocr_onnxruntime-1.4.0-py3-none-any.whl", hash = "sha256:d21c4ba2ef80b7a8ecf8178632f273398a92ab44a1ffb9e171139ef2a589d690"}, +] + +[package.dependencies] +numpy = ">=1.19.5,<3.0.0" +onnxruntime = ">=1.7.0" +opencv-python = ">=4.5.1.48" +Pillow = "*" +pyclipper = ">=1.2.0" +PyYAML = "*" +Shapely = ">=1.7.1,<2.0.4 || >2.0.4" +six = ">=1.15.0" +tqdm = "*" + [[package]] name = "readme-renderer" version = "44.0" @@ -7226,9 +7409,10 @@ type = ["pytest-mypy"] [extras] ocrmac = ["ocrmac"] +rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"] tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "0e1ba9bc3f10cdd3ddf161469701ca603279bad13505ff3b7b042826e2de4c35" +content-hash = "c08324f73fb809466ad3494605a6745ec6c9f38b60e7b1f516f3f93a29534ca4" diff --git a/pyproject.toml b/pyproject.toml index f387657a..86050cac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,13 @@ marko = "^2.1.2" openpyxl = "^3.1.5" lxml = ">=4.0.0,<6.0.0" ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true } +rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" } +onnxruntime = [ + # 1.19.2 is the last version with python3.9 support, + # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 + { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, + { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } +] [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} @@ -104,6 +111,7 @@ torchvision = [ [tool.poetry.extras] tesserocr = ["tesserocr"] ocrmac = ["ocrmac"] +rapidocr = ["rapidocr-onnxruntime", "onnxruntime"] [tool.poetry.scripts] docling = "docling.cli.main:app" diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index 68dac33e..73a943af 100644 --- 
a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -10,6 +10,7 @@ OcrMacOptions, OcrOptions, PdfPipelineOptions, + RapidOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, ) @@ -56,9 +57,11 @@ def test_e2e_conversions(): EasyOcrOptions(), TesseractOcrOptions(), TesseractCliOcrOptions(), + RapidOcrOptions(), EasyOcrOptions(force_full_page_ocr=True), TesseractOcrOptions(force_full_page_ocr=True), TesseractCliOcrOptions(force_full_page_ocr=True), + RapidOcrOptions(force_full_page_ocr=True), ] # only works on mac