Skip to content

Commit 06b4ebf

Browse files
authored
Merge pull request #10 from maxent-ai/pipeline
Pipeline
2 parents e70afa3 + 4166192 commit 06b4ebf

File tree

5 files changed

+62
-3
lines changed

5 files changed

+62
-3
lines changed

ocrpy/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.3.3"
1+
__version__ = "0.3.4"
22

33
from .io import *
44
from .parsers import *

ocrpy/io/reader.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ class DocumentReader:
3333

3434
file: str = field()
3535
credentials: str = field(default=None)
36-
storage_type = field(default=None, init=False)
36+
storage_type: str = field(default=None, init=False)
3737

3838
def __attrs_post_init__(self):
3939
self.storage_type = guess_storage(self.file)

ocrpy/parsers/text/__init__.py

+1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from .gcp_text import *
22
from .aws_text import *
33
from .tesseract_text import *
4+
from .text_parser import *

ocrpy/parsers/text/text_parser.py

+54
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
from attrs import define, field
2+
from typing import Any, Optional
3+
from .aws_text import AwsTextOCR
4+
from .gcp_text import GcpTextOCR
5+
from .tesseract_text import TesseractTextOCR
6+
from ...utils import BackendNotSupported
7+
8+
__all__ = ["TextParser"]
9+
10+
11+
@define
class TextParser:
    """
    Single entry point that picks one of several text OCR backends by name.

    The supported backend names are "pytesseract", "aws-textract" and
    "google-cloud-vision".

    Attributes
    ----------
    reader : Any
        The reader object, handed unchanged to the selected backend class.
    credentials : Optional[str]
        The credentials handed to the selected backend class.
        default: None
    backend : str
        The name of the backend to use.
        default: "pytesseract"

    Raises
    ------
    BackendNotSupported
        When ``backend`` is validated and is not one of the supported names.
    """

    reader: Any = field()
    credentials: Optional[str] = field(default=None)
    backend: str = field(default="pytesseract")

    @backend.validator
    def supported_backends(self, attribute, value):
        # attrs validator for ``backend``: accept known names, reject the rest.
        known = ["pytesseract", "aws-textract", "google-cloud-vision"]
        if value in known:
            return
        options = ", ".join(known)
        raise BackendNotSupported(
            f"backend type {value} not supported. choose one of these instead: {options}"
        )

    def _dispatch_parser(self):
        """Return the backend class registered under ``self.backend``."""
        return {
            "pytesseract": TesseractTextOCR,
            "aws-textract": AwsTextOCR,
            "google-cloud-vision": GcpTextOCR,
        }[self.backend]

    def parse(self):
        """Instantiate the selected backend and return the result of its ``parse()``."""
        backend_cls = self._dispatch_parser()
        # The backend class is called with the reader first, then the credentials.
        return backend_cls(self.reader, self.credentials).parse()

ocrpy/utils/exceptions.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__all__ = ["FileTypeNotSupported", "AttributeNotSupported"]
1+
__all__ = ["FileTypeNotSupported", "AttributeNotSupported", "BackendNotSupported"]
22

33

44
class FileTypeNotSupported(Exception):
@@ -11,3 +11,7 @@ class AttributeNotSupported(Exception):
1111
"""Raise when an Attribute like block or line extraction is not supported by the backends."""
1212

1313
pass
14+
15+
16+
class BackendNotSupported(Exception):
    """Raise when the requested OCR backend name is not one of the supported backends."""

0 commit comments

Comments
 (0)