Skip to content

Commit

Permalink
fix: Improve OCR results, tighten criteria before dropping bitmap ar…
Browse files Browse the repository at this point in the history
…eas (#719)

fix: Properly account for all bitmap elements in OCR

Signed-off-by: Christoph Auer <[email protected]>
  • Loading branch information
cau-git authored Jan 10, 2025
1 parent 9a6b5c8 commit 5a060f2
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 16 deletions.
2 changes: 1 addition & 1 deletion docling/backend/docling_parse_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ def draw_clusters_and_cells():
return cells

def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
AREA_THRESHOLD = 0 # 32 * 32

for i in range(len(self._dpage["images"])):
bitmap = self._dpage["images"][i]
Expand Down
2 changes: 1 addition & 1 deletion docling/backend/docling_parse_v2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def draw_clusters_and_cells():
return cells

def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
AREA_THRESHOLD = 0 # 32 * 32

images = self._dpage["sanitized"]["images"]["data"]
images_header = self._dpage["sanitized"]["images"]["header"]
Expand Down
2 changes: 1 addition & 1 deletion docling/backend/pypdfium2_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def is_valid(self) -> bool:
return self.valid

def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32
AREA_THRESHOLD = 0 # 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
Expand Down
2 changes: 1 addition & 1 deletion docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions):

use_gpu: Optional[bool] = None

confidence_threshold: float = 0.65
confidence_threshold: float = 0.5

model_storage_directory: Optional[str] = None
recog_network: Optional[str] = "standard"
Expand Down
27 changes: 15 additions & 12 deletions docling/models/base_ocr_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from docling_core.types.doc import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from scipy.ndimage import binary_dilation, find_objects, label

from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
Expand Down Expand Up @@ -43,6 +43,12 @@ def find_ocr_rects(size, bitmap_rects):

np_image = np.array(image)

# Dilate the image by 10 pixels to merge nearby bitmap rectangles
structure = np.ones(
(20, 20)
) # Create a 20x20 structure element (10 pixels in all directions)
np_image = binary_dilation(np_image > 0, structure=structure)

# Find the connected components
labeled_image, num_features = label(
np_image > 0
Expand Down Expand Up @@ -72,7 +78,7 @@ def find_ocr_rects(size, bitmap_rects):
bitmap_rects = []
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

# return full-page rectangle if sufficiently covered with bitmaps
# return full-page rectangle if the page is predominantly covered with bitmaps
if self.options.force_full_page_ocr or coverage > max(
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
):
Expand All @@ -85,17 +91,11 @@ def find_ocr_rects(size, bitmap_rects):
coord_origin=CoordOrigin.TOPLEFT,
)
]
# return individual rectangles if the bitmap coverage is smaller
else: # coverage <= BITMAP_COVERAGE_TRESHOLD:

# skip OCR if the bitmap area on the page is smaller than the options threshold
ocr_rects = [
rect
for rect in ocr_rects
if rect.area() / (page.size.width * page.size.height)
> self.options.bitmap_area_threshold
]
# return individual rectangles if the bitmap coverage is above the threshold
elif coverage > self.options.bitmap_area_threshold:
return ocr_rects
else: # overall coverage of bitmaps is too low, drop all bitmap rectangles.
return []

# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
Expand Down Expand Up @@ -162,6 +162,9 @@ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False
x0 *= scale_x
x1 *= scale_x

if y1 <= y0:
y1, y0 = y0, y1

color = "gray"
if isinstance(tc, OcrCell):
color = "magenta"
Expand Down

0 comments on commit 5a060f2

Please sign in to comment.