Skip to content

Commit

Permalink
fix: python3.9 support (#396)
Browse files: browse the repository at this point in the history
* fixes for python3.9

Signed-off-by: Michele Dolfi <[email protected]>

* pin docling-parse with python3.9 wheels

Signed-off-by: Michele Dolfi <[email protected]>

* update deps

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Nov 20, 2024
1 parent 6efa96c commit 7b013ab
Show file tree
Hide file tree
Showing 6 changed files with 241 additions and 250 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/checks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11', '3.12']
steps:
- uses: actions/checkout@v3
- name: Install tesseract
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869)
[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/)
[![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/)
![Python](https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue)
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/)
[![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/)
Expand Down
38 changes: 18 additions & 20 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,17 +254,16 @@ def convert(
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats

match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
if ocr_engine == OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
elif ocr_engine == OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
else:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None:
Expand All @@ -281,15 +280,14 @@ def convert(
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path

match pdf_backend:
case PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
case PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
case PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
case _:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
if pdf_backend == PdfBackend.DLPARSE_V1:
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
elif pdf_backend == PdfBackend.PYPDFIUM2:
backend = PyPdfiumDocumentBackend
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
Expand Down
6 changes: 3 additions & 3 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import time
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Type
from typing import Dict, Iterable, Iterator, List, Optional, Type, Union

from pydantic import BaseModel, ConfigDict, model_validator, validate_call

Expand Down Expand Up @@ -155,7 +155,7 @@ def initialize_pipeline(self, format: InputFormat):
@validate_call(config=ConfigDict(strict=True))
def convert(
self,
source: Path | str | DocumentStream, # TODO review naming
source: Union[Path, str, DocumentStream], # TODO review naming
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
Expand All @@ -172,7 +172,7 @@ def convert(
@validate_call(config=ConfigDict(strict=True))
def convert_all(
self,
source: Iterable[Path | str | DocumentStream], # TODO review naming
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
Expand Down
Loading

0 comments on commit 7b013ab

Please sign in to comment.