Skip to content

Commit

Permalink
feat(ocr): added support for RapidOCR engine (#415)
Browse files Browse the repository at this point in the history
* adding rapidocr engine for ocr in docling

Signed-off-by: swayam-singhal <[email protected]>

* fixing styling format

Signed-off-by: Swaymaw <[email protected]>

* updating pyproject.toml and poetry.lock to fix ci bugs

Signed-off-by: Swaymaw <[email protected]>

* help poetry pinning for python3.9

Signed-off-by: Michele Dolfi <[email protected]>

* simplifying rapidocr options so that device can be changed using a single option for all models

Signed-off-by: Swaymaw <[email protected]>

* fix styling issues and small bug in rapidOcrOptions

Signed-off-by: Swaymaw <[email protected]>

* use default device until we enable global management

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: swayam-singhal <[email protected]>
Signed-off-by: Swaymaw <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: swayam-singhal <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
3 people authored Nov 27, 2024
1 parent 767563b commit 85b2999
Show file tree
Hide file tree
Showing 9 changed files with 405 additions and 13 deletions.
4 changes: 4 additions & 0 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
RapidOcrOptions,
TableFormerMode,
TesseractCliOcrOptions,
TesseractOcrOptions,
Expand Down Expand Up @@ -76,6 +77,7 @@ class OcrEngine(str, Enum):
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
RAPIDOCR = "rapidocr"


def export_documents(
Expand Down Expand Up @@ -262,6 +264,8 @@ def convert(
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.RAPIDOCR:
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
else:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

Expand Down
36 changes: 36 additions & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,42 @@ class OcrOptions(BaseModel):
)


class RapidOcrOptions(OcrOptions):
kind: Literal["rapidocr"] = "rapidocr"

# English and chinese are the most commly used models and have been tested with RapidOCR.
lang: List[str] = [
"english",
"chinese",
] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/

# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
text_score: float = 0.5 # same default as rapidocr

use_det: Optional[bool] = None # same default as rapidocr
use_cls: Optional[bool] = None # same default as rapidocr
use_rec: Optional[bool] = None # same default as rapidocr

# class Device(Enum):
# CPU = "CPU"
# CUDA = "CUDA"
# DIRECTML = "DIRECTML"
# AUTO = "AUTO"

# device: Device = Device.AUTO # Default value is AUTO

print_verbose: bool = False # same default as rapidocr

det_model_path: Optional[str] = None # same default as rapidocr
cls_model_path: Optional[str] = None # same default as rapidocr
rec_model_path: Optional[str] = None # same default as rapidocr

model_config = ConfigDict(
extra="forbid",
)


class EasyOcrOptions(OcrOptions):
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
Expand Down
147 changes: 147 additions & 0 deletions docling/models/rapid_ocr_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import logging
from typing import Iterable

import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import RapidOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class RapidOcrModel(BaseOcrModel):
def __init__(self, enabled: bool, options: RapidOcrOptions):
super().__init__(enabled=enabled, options=options)
self.options: RapidOcrOptions

self.scale = 3 # multiplier for 72 dpi == 216 dpi.

if self.enabled:
try:
from rapidocr_onnxruntime import RapidOCR # type: ignore
except ImportError:
raise ImportError(
"RapidOCR is not installed. Please install it via `pip install rapidocr_onnxruntime` to use this OCR engine. "
"Alternatively, Docling has support for other OCR engines. See the documentation."
)

# This configuration option will be revamped while introducing device settings for all models.
# For the moment we will default to auto and let onnx-runtime pick the best.
cls_use_cuda = True
rec_use_cuda = True
det_use_cuda = True
det_use_dml = True
cls_use_dml = True
rec_use_dml = True

# # Same as Defaults in RapidOCR
# cls_use_cuda = False
# rec_use_cuda = False
# det_use_cuda = False
# det_use_dml = False
# cls_use_dml = False
# rec_use_dml = False

# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
# if self.options.device == self.options.Device.AUTO:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
# elif self.options.device == self.options.Device.CUDA:
# cls_use_cuda = True
# rec_use_cuda = True
# det_use_cuda = True

# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
# elif self.options.device == self.options.Device.DIRECTML:
# det_use_dml = True
# cls_use_dml = True
# rec_use_dml = True

self.reader = RapidOCR(
text_score=self.options.text_score,
cls_use_cuda=cls_use_cuda,
rec_use_cuda=rec_use_cuda,
det_use_cuda=det_use_cuda,
det_use_dml=det_use_dml,
cls_use_dml=cls_use_dml,
rec_use_dml=rec_use_dml,
print_verbose=self.options.print_verbose,
det_model_path=self.options.det_model_path,
cls_model_path=self.options.cls_model_path,
rec_model_path=self.options.rec_model_path,
)

def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:

if not self.enabled:
yield from page_batch
return

for page in page_batch:

assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)

all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
result, _ = self.reader(
im,
use_det=self.options.use_det,
use_cls=self.options.use_cls,
use_rec=self.options.use_rec,
)

del high_res_image
del im

cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)

# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

# DEBUG code:
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)

yield page
7 changes: 7 additions & 0 deletions docling/pipeline/standard_pdf_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
Expand All @@ -26,6 +27,7 @@
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
from docling.models.tesseract_ocr_model import TesseractOcrModel
Expand Down Expand Up @@ -121,6 +123,11 @@ def get_ocr_model(self) -> Optional[BaseOcrModel]:
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, RapidOcrOptions):
return RapidOcrModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(
Expand Down
4 changes: 3 additions & 1 deletion docs/examples/full_page_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
Expand All @@ -20,10 +21,11 @@ def main():
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True

# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only)
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only), RapidOcrOptions
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
# ocr_options = OcrMacOptions(force_full_page_ocr=True)
# ocr_options = RapidOcrOptions(force_full_page_ocr=True)
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options

Expand Down
1 change: 1 addition & 0 deletions docs/installation.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
| OcrMac | System dependency. See description below. | `OcrMacOptions` |
| [RapidOCR](https://github.com/RapidAI/RapidOCR) | Extra feature not included in Default Docling installation can be installed via `pip install rapidocr_onnxruntime` | `RapidOcrOptions` |

The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example

Expand Down
Loading

0 comments on commit 85b2999

Please sign in to comment.