Knowledge-Innovation-Centre · anthonycamilleri · Jan 4, 2026 · Jan 4, 2026
diff --git a/README.md b/README.md
@@ -276,7 +276,7 @@ Each component can be used independently or through the unified `DocumentProcess
 - pdfminer.six - PDF text extraction
 - pdf2image - PDF to image conversion
 - pytesseract - OCR engine
-- opencv-python - Image preprocessing
+- scikit-image - Image preprocessing
 - Pillow - Image handling
 - python-docx - DOCX extraction
 - python-pptx - PPTX extraction

diff --git a/docprocessor/core/ocr.py b/docprocessor/core/ocr.py
@@ -6,9 +6,9 @@
 from io import BytesIO
 from typing import List
 
-import cv2
 import numpy as np
 import pytesseract
+from skimage.filters import threshold_otsu
 from pdf2image import convert_from_bytes
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LAParams, LTTextBox, LTTextLine
@@ -190,9 +190,10 @@ def is_likely_table_column(element: TextElement, all_elements: List[TextElement]
 
 def preprocess_image(image: Image.Image) -> Image.Image:
     """
-    Optimize image for OCR: grayscale and threshold.
+    Optimize image for OCR: convert to grayscale, then binarize with Otsu thresholding.
     """
-    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
-    # Apply Otsu thresholding
-    thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-    return Image.fromarray(thresh)
+    gray = image.convert("L")
+    gray_array = np.array(gray)
+    thresh_value = threshold_otsu(gray_array)
+    binary = (gray_array > thresh_value).astype(np.uint8) * 255
+    return Image.fromarray(binary)
diff --git a/docs/installation.rst b/docs/installation.rst
@@ -76,7 +76,7 @@ Meilisearch is installed by default. If you don't need it:
 .. code-block:: bash
 
     pip install docprocessor --no-deps
-    pip install pdfminer.six pdf2image pytesseract opencv-python Pillow python-docx langchain-text-splitters tiktoken
+    pip install pdfminer.six pdf2image pytesseract scikit-image Pillow python-docx langchain-text-splitters tiktoken
 
 Development Tools
 ~~~~~~~~~~~~~~~~~

diff --git a/pyproject.toml b/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "pdf2image>=1.17.0",
     "pillow>=11.0.0",
     "pytesseract>=0.3.13",
-    "opencv-python>=4.8.0",
+    "scikit-image>=0.22.0",
     "python-docx>=1.1.0",
 ]
 

diff --git a/requirements.txt b/requirements.txt
@@ -1,7 +1,7 @@
 pdfminer.six>=20221105
 pdf2image>=1.16.3
 pytesseract>=0.3.10
-opencv-python>=4.9.0
+scikit-image>=0.22.0
 Pillow>=10.2.0
 python-docx>=1.1.0
 python-pptx>=0.6.21

diff --git a/setup.py b/setup.py
@@ -29,7 +29,7 @@
         "pdfminer.six>=20221105",
         "pdf2image>=1.16.3",
         "pytesseract>=0.3.10",
-        "opencv-python>=4.9.0",
+        "scikit-image>=0.22.0",
         "Pillow>=10.2.0",
         "python-docx>=1.1.0",
         "python-pptx>=0.6.21",