From b346faf622190c4895dffdc1ee2365b3f7808cbc Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 18 Oct 2024 13:58:23 +0200
Subject: [PATCH] feat: add coverage_threshold to skip OCR for small images
 (#161)

* feat: add coverage_threshold to skip OCR for small images

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* filter individual boxes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/datamodel/pipeline_options.py |  3 +++
 docling/models/base_ocr_model.py      | 10 +++++++++-
 tests/test_options.py                 | 20 ++++++++++++++++++++
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 637b0c0e..efaa6ff8 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,9 @@ class TableStructureOptions(BaseModel):
 
 class OcrOptions(BaseModel):
     kind: str
+    bitmap_area_threshold: float = (
+        0.05  # fraction of the page area a bitmap must exceed to be processed with OCR
+    )
 
 
 class EasyOcrOptions(OcrOptions):
diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py
index 59ae2295..da6860a8 100644
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -69,7 +69,7 @@ def find_ocr_rects(size, bitmap_rects):
         coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
 
         # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > BITMAP_COVERAGE_TRESHOLD:
+        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
             return [
                 BoundingBox(
                     l=0,
@@ -81,6 +81,14 @@ def find_ocr_rects(size, bitmap_rects):
             ]
         # return individual rectangles if the bitmap coverage is smaller
         else:  # coverage <= BITMAP_COVERAGE_TRESHOLD:
+
+            # drop individual bitmaps whose share of the page area does not exceed the options threshold
+            ocr_rects = [
+                rect
+                for rect in ocr_rects
+                if rect.area() / (page.size.width * page.size.height)
+                > self.options.bitmap_area_threshold
+            ]
             return ocr_rects
 
     # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
diff --git a/tests/test_options.py b/tests/test_options.py
index ad6c7a45..c53570cc 100644
--- a/tests/test_options.py
+++ b/tests/test_options.py
@@ -42,3 +42,23 @@ def test_e2e_conversions(test_doc_path):
         doc_result: ConversionResult = converter.convert(test_doc_path)
 
         assert doc_result.status == ConversionStatus.SUCCESS
+
+
+def test_ocr_coverage_threshold(test_doc_path):
+    pipeline_options = PdfPipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.ocr_options.bitmap_area_threshold = 1.1
+
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+            )
+        }
+    )
+
+    test_doc_path = Path("./tests/data_scanned/ocr_test.pdf")
+    doc_result: ConversionResult = converter.convert(test_doc_path)
+
+    # no results expected: the 1.1 threshold is above 1.0, i.e. more than the whole page area
+    assert len(doc_result.document.texts) == 0