diff --git a/README.md b/README.md
index 1c69876..ca4a6e9 100644
--- a/README.md
+++ b/README.md
@@ -276,7 +276,7 @@ Each component can be used independently or through the unified `DocumentProcess
 - pdfminer.six - PDF text extraction
 - pdf2image - PDF to image conversion
 - pytesseract - OCR engine
-- opencv-python - Image preprocessing
+- scikit-image - Image preprocessing
 - Pillow - Image handling
 - python-docx - DOCX extraction
 - python-pptx - PPTX extraction
diff --git a/docprocessor/core/ocr.py b/docprocessor/core/ocr.py
index 2e07603..822933d 100644
--- a/docprocessor/core/ocr.py
+++ b/docprocessor/core/ocr.py
@@ -6,9 +6,9 @@
 from io import BytesIO
 from typing import List
 
-import cv2
 import numpy as np
 import pytesseract
+from skimage.filters import threshold_otsu
 from pdf2image import convert_from_bytes
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LAParams, LTTextBox, LTTextLine
@@ -190,9 +190,19 @@ def is_likely_table_column(element: TextElement, all_elements: List[TextElement]
 
 def preprocess_image(image: Image.Image) -> Image.Image:
     """
-    Optimize image for OCR: grayscale and threshold.
+    Optimize image for OCR: grayscale and Otsu thresholding.
+
+    Pixels brighter than the Otsu threshold become 255 and all others 0.
+    A single-intensity page (e.g. a blank scan) is thresholded at 0, so a
+    blank white page stays white instead of turning solid black.
     """
-    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
-    # Apply Otsu thresholding
-    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-    return Image.fromarray(thresh)
+    gray = image.convert("L")
+    gray_array = np.array(gray)
+    if gray_array.min() == gray_array.max():
+        # Otsu is undefined for a single intensity: threshold_otsu returns
+        # that intensity, and ``> thresh_value`` would then clear every pixel.
+        thresh_value = 0
+    else:
+        thresh_value = threshold_otsu(gray_array)
+    binary = (gray_array > thresh_value).astype(np.uint8) * 255
+    return Image.fromarray(binary)
diff --git a/docs/installation.rst b/docs/installation.rst
index 7f79e82..b9bfe3d 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -76,7 +76,7 @@ Meilisearch is installed by default. If you don't need it:
 .. code-block:: bash
 
    pip install docprocessor --no-deps
-   pip install pdfminer.six pdf2image pytesseract opencv-python Pillow python-docx langchain-text-splitters tiktoken
+   pip install pdfminer.six pdf2image pytesseract scikit-image Pillow python-docx langchain-text-splitters tiktoken
 
 Development Tools
 ~~~~~~~~~~~~~~~~~
diff --git a/pyproject.toml b/pyproject.toml
index 57f3bd0..144d136 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "pdf2image>=1.17.0",
     "pillow>=11.0.0",
     "pytesseract>=0.3.13",
-    "opencv-python>=4.8.0",
+    "scikit-image>=0.22.0",
     "python-docx>=1.1.0",
 ]
 
diff --git a/requirements.txt b/requirements.txt
index 773420e..ca148d9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 pdfminer.six>=20221105
 pdf2image>=1.16.3
 pytesseract>=0.3.10
-opencv-python>=4.9.0
+scikit-image>=0.22.0
 Pillow>=10.2.0
 python-docx>=1.1.0
 python-pptx>=0.6.21
diff --git a/setup.py b/setup.py
index 20f48ba..76e5886 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
         "pdfminer.six>=20221105",
         "pdf2image>=1.16.3",
         "pytesseract>=0.3.10",
-        "opencv-python>=4.9.0",
+        "scikit-image>=0.22.0",
         "Pillow>=10.2.0",
         "python-docx>=1.1.0",
         "python-pptx>=0.6.21",