docs: Add example of how to use the "auto" language with tesseract OCR engines

Signed-off-by: Nikos Livathinos <[email protected]>
DS4SD · Jan 24, 2025 · cdb57e0 · cdb57e0
1 parent 4c2552e
commit cdb57e0
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 0 deletions.
diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py
@@ -0,0 +1,35 @@
+from pathlib import Path
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+    # Example: run OCR on a sample PDF with lang=["auto"] and print it as Markdown.
+    input_doc = Path("./tests/data/2206.01062.pdf")
+    # Set lang=["auto"] on either tesseract options class; the alternative is:
+    # ocr_options = TesseractOcrOptions(lang=["auto"])
+    ocr_options = TesseractCliOcrOptions(lang=["auto"])
+    # Build PDF pipeline options with do_ocr=True and the OCR options chosen above.
+    pipeline_options = PdfPipelineOptions(do_ocr=True, ocr_options=ocr_options)
+    # Map the PDF input format to a format option carrying these pipeline options.
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+    # Convert the input, take the resulting document, and print its Markdown export.
+    doc = converter.convert(input_doc).document
+    md = doc.export_to_markdown()
+    print(md)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -75,6 +75,7 @@ nav:
       - "Table export": examples/export_tables.py
       - "Multimodal export": examples/export_multimodal.py
       - "Force full page OCR": examples/full_page_ocr.py
+      - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
       - "Accelerator options": examples/run_with_accelerator.py
       - "Simple translation": examples/translate.py      
     - ✂️ Chunking: