|
| 1 | +from attrs import define, field |
| 2 | +from typing import Any, Optional |
| 3 | +from .aws_text import AwsTextOCR |
| 4 | +from .gcp_text import GcpTextOCR |
| 5 | +from .tesseract_text import TesseractTextOCR |
| 6 | +from ...utils import BackendNotSupported |
| 7 | + |
| 8 | +__all__ = ["TextParser"] |
| 9 | + |
| 10 | + |
@define
class TextParser:
    """
    Single entry point that forwards text OCR to one of several backends.
    Note: only Pytesseract, Google Cloud Vision and Amazon Textract are
    currently wired in.

    Attributes
    ----------
    reader : Any
        The reader object handed to the selected backend class as its
        first positional argument. Required.
    credentials : Optional[str]
        The credentials handed to the selected backend class as its
        second positional argument.
        default: None
    backend : str
        The name of the backend to use.
        default: "pytesseract"
        accepted values: "pytesseract", "aws-textract", "google-cloud-vision"
    """

    # NOTE: field order defines the positional order of the generated
    # __init__ (reader, credentials, backend); keep it stable.
    reader: Any = field()
    credentials: Optional[str] = field(default=None)
    backend: str = field(default="pytesseract")

    @backend.validator
    def supported_backends(self, attribute, value):
        """
        Reject backend names that are not supported.

        Parameters
        ----------
        attribute
            The attrs attribute being validated (unused).
        value
            The candidate backend name.

        Raises
        ------
        BackendNotSupported
            If ``value`` is not one of the supported backend names.
        """
        allowed_names = ["pytesseract", "aws-textract", "google-cloud-vision"]
        if value in allowed_names:
            return

        choices = ", ".join(allowed_names)
        raise BackendNotSupported(
            f"backend type {value} not supported. choose one of these instead: {choices}"
        )

    def _dispatch_parser(self):
        """
        Look up the OCR class registered for ``self.backend``.

        Returns
        -------
        The backend class (not an instance) mapped to ``self.backend``.
        """
        backend_classes = {
            "pytesseract": TesseractTextOCR,
            "aws-textract": AwsTextOCR,
            "google-cloud-vision": GcpTextOCR,
        }
        selected_cls = backend_classes[self.backend]
        return selected_cls

    def parse(self):
        """
        Run the selected backend and return whatever its ``parse()`` returns.

        The backend class is instantiated with ``self.reader`` and
        ``self.credentials`` (in that order) on every call.
        """
        ocr_cls = self._dispatch_parser()
        ocr = ocr_cls(self.reader, self.credentials)
        return ocr.parse()
0 commit comments