-
Notifications
You must be signed in to change notification settings - Fork 1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Introduce automatic language detection in TesseractOcrCliModel (#800)

* feat: Introduce automatic language detection in tesseract_ocr_cli model. Extend unit tests. Signed-off-by: Nikos Livathinos <[email protected]>
* docs: Add example how to use "auto" language with tesseract OCR engines. Signed-off-by: Nikos Livathinos <[email protected]>
* fix: Refactor the TesseractOcrModel and TesseractOcrCliModel to validate if the auto-detected language is installed in the system and, if not, fall back to a default option without language. Signed-off-by: Nikos Livathinos <[email protected]>

---------

Signed-off-by: Nikos Livathinos <[email protected]>
- Loading branch information
1 parent
9e4ca90
commit 3be2fb5
Showing
6 changed files
with
157 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
def map_tesseract_script(script: str) -> str:
    """Return the mapped name for a detected script.

    The following names are rewritten; every other value is returned
    unchanged:

    * ``"Katakana"`` and ``"Hiragana"`` -> ``"Japanese"``
    * ``"Han"`` -> ``"HanS"``
    * ``"Korean"`` -> ``"Hangul"``

    NOTE(review): the target names presumably match the script model names
    Tesseract expects -- confirm against the installed tessdata.

    Args:
        script: Script name to normalize.

    Returns:
        The mapped script name, or ``script`` itself when no mapping applies.
    """
    # Lookup table instead of an if/elif chain: one entry per renamed script.
    aliases = {
        "Katakana": "Japanese",
        "Hiragana": "Japanese",
        "Han": "HanS",
        "Korean": "Hangul",
    }
    return aliases.get(script, script)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from pathlib import Path | ||
|
||
from docling.datamodel.base_models import InputFormat | ||
from docling.datamodel.pipeline_options import ( | ||
PdfPipelineOptions, | ||
TesseractCliOcrOptions, | ||
TesseractOcrOptions, | ||
) | ||
from docling.document_converter import DocumentConverter, PdfFormatOption | ||
|
||
|
||
def main():
    """Convert a sample PDF using OCR with ``lang=["auto"]`` and print Markdown.

    Builds a ``DocumentConverter`` whose PDF pipeline has OCR enabled on the
    full page, converts the sample document, and prints the Markdown export
    to standard output.
    """
    input_doc = Path("./tests/data/2206.01062.pdf")

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
    ocr_options = TesseractCliOcrOptions(lang=["auto"])

    # force_full_page_ocr=True runs OCR over the whole page.
    pipeline_options = PdfPipelineOptions(
        do_ocr=True, force_full_page_ocr=True, ocr_options=ocr_options
    )

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc).document
    md = doc.export_to_markdown()
    print(md)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters