diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index bb1fe058..89b25ee1 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -132,7 +132,7 @@ def draw_clusters_and_cells(): return cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 32 * 32 + AREA_THRESHOLD = 0 # 32 * 32 for i in range(len(self._dpage["images"])): bitmap = self._dpage["images"][i] diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 12d7df55..366fa6ac 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -140,7 +140,7 @@ def draw_clusters_and_cells(): return cells def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 32 * 32 + AREA_THRESHOLD = 0 # 32 * 32 images = self._dpage["sanitized"]["images"]["data"] images_header = self._dpage["sanitized"]["images"]["header"] diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index d24ba608..2566fc18 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -39,7 +39,7 @@ def is_valid(self) -> bool: return self.valid def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: - AREA_THRESHOLD = 32 * 32 + AREA_THRESHOLD = 0 # 32 * 32 for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): pos = obj.get_pos() cropbox = BoundingBox.from_tuple( diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index fc78e27c..eeec6bab 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -139,7 +139,7 @@ class EasyOcrOptions(OcrOptions): use_gpu: Optional[bool] = None - confidence_threshold: float = 0.65 + confidence_threshold: float = 0.5 model_storage_directory: Optional[str] = None recog_network: Optional[str] = "standard" diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 68553c48..9afb7dde 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -8,7 +8,7 @@ from docling_core.types.doc import BoundingBox, CoordOrigin from PIL import Image, ImageDraw from rtree import index -from scipy.ndimage import find_objects, label +from scipy.ndimage import binary_dilation, find_objects, label from docling.datamodel.base_models import Cell, OcrCell, Page from docling.datamodel.document import ConversionResult @@ -43,6 +43,12 @@ def find_ocr_rects(size, bitmap_rects): np_image = np.array(image) + # Dilate the image by 10 pixels to merge nearby bitmap rectangles + structure = np.ones( + (20, 20) + ) # Create a 20x20 structure element (10 pixels in all directions) + np_image = binary_dilation(np_image > 0, structure=structure) + # Find the connected components labeled_image, num_features = label( np_image > 0 @@ -72,7 +78,7 @@ def find_ocr_rects(size, bitmap_rects): bitmap_rects = [] coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects) - # return full-page rectangle if sufficiently covered with bitmaps + # return full-page rectangle if page is dominantly covered with bitmaps if self.options.force_full_page_ocr or coverage > max( BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold ): @@ -85,17 +91,11 @@ def find_ocr_rects(size, bitmap_rects): coord_origin=CoordOrigin.TOPLEFT, ) ] - # return individual rectangles if the bitmap coverage is smaller - else: # coverage <= BITMAP_COVERAGE_TRESHOLD: - - # skip OCR if the bitmap area on the page is smaller than the options threshold - ocr_rects = [ - rect - for rect in ocr_rects - if rect.area() / (page.size.width * page.size.height) - > self.options.bitmap_area_threshold - ] + # return individual rectangles if the bitmap coverage is above the threshold + elif coverage > self.options.bitmap_area_threshold: return ocr_rects + else: # overall coverage of bitmaps is too low, drop all bitmap rectangles. + return [] # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. def _filter_ocr_cells(self, ocr_cells, programmatic_cells): @@ -162,6 +162,9 @@ def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False x0 *= scale_x x1 *= scale_x + if y1 <= y0: + y1, y0 = y0, y1 + color = "gray" if isinstance(tc, OcrCell): color = "magenta"