Skip to content

Commit

Permalink
feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)
Browse files Browse the repository at this point in the history

- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <[email protected]>
nikos-livathinos authored Nov 12, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 81c8243 commit c6b3763
Showing 10 changed files with 100 additions and 62 deletions.
13 changes: 10 additions & 3 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
@@ -153,6 +153,13 @@ def convert(
..., help="If enabled, the bitmap content will be processed using OCR."
),
] = True,
force_ocr: Annotated[
bool,
typer.Option(
...,
help="Replace any existing text with OCR generated text over the full content.",
),
] = False,
ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(

match ocr_engine:
case OcrEngine.EASYOCR:
ocr_options: OcrOptions = EasyOcrOptions()
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT_CLI:
ocr_options = TesseractCliOcrOptions()
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions()
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

1 change: 1 addition & 0 deletions docling/datamodel/pipeline_options.py
Original file line number Diff line number Diff line change
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):

class OcrOptions(BaseModel):
kind: str
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
bitmap_area_threshold: float = (
0.05 # percentage of the area for a bitmap to processed with OCR
)
25 changes: 22 additions & 3 deletions docling/models/base_ocr_model.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@
from rtree import index
from scipy.ndimage import find_objects, label

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrOptions
from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ def find_ocr_rects(size, bitmap_rects):
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

# return full-page rectangle if sufficiently covered with bitmaps
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
if self.options.force_full_page_ocr or coverage > max(
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
):
return [
BoundingBox(
l=0,
@@ -96,7 +98,7 @@ def find_ocr_rects(size, bitmap_rects):
return ocr_rects

# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
# Create R-tree index for programmatic cells
p = index.Property()
p.dimension = 2
@@ -117,6 +119,23 @@ def is_overlapping_with_existing_cells(ocr_cell):
]
return filtered_ocr_cells

def post_process_cells(self, ocr_cells, programmatic_cells):
    r"""
    Post-process the OCR and programmatic cells and return the final list of cells.

    If ``self.options.force_full_page_ocr`` is set, the programmatic cells are
    ignored and a new list is returned that holds one ``Cell`` per OCR cell
    (copying its ``id``, ``text`` and ``bbox``).

    Otherwise the OCR cells kept by ``self._filter_ocr_cells`` are appended to
    ``programmatic_cells`` in place, and that same list object is returned.
    """
    if self.options.force_full_page_ocr:
        # If a full page OCR is forced, use only the OCR cells
        return [
            Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
            for c_ocr in ocr_cells
        ]

    # Remove OCR cells which overlap with programmatic cells.
    filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
    # The caller's list is extended in place and handed back.
    programmatic_cells.extend(filtered_ocr_cells)
    return programmatic_cells

def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image, "RGBA")
10 changes: 3 additions & 7 deletions docling/models/easyocr_model.py
Original file line number Diff line number Diff line change
@@ -5,7 +5,7 @@
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings
@@ -88,12 +88,8 @@ def __call__(
]
all_ocr_cells.extend(cells)

## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)

page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

# DEBUG code:
if settings.debug.visualize_ocr:
10 changes: 3 additions & 7 deletions docling/models/tesseract_ocr_cli_model.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,7 @@
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ def __call__(
)
all_ocr_cells.append(cell)

## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)

page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

# DEBUG code:
if settings.debug.visualize_ocr:
10 changes: 3 additions & 7 deletions docling/models/tesseract_ocr_model.py
Original file line number Diff line number Diff line change
@@ -3,7 +3,7 @@

from docling_core.types.doc import BoundingBox, CoordOrigin

from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractOcrOptions
from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ def __call__(
# del high_res_image
all_ocr_cells.extend(cells)

## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)

page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)

# DEBUG code:
if settings.debug.visualize_ocr:
42 changes: 42 additions & 0 deletions docs/examples/full_page_ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption


def main():
    """Convert a sample PDF with full-page OCR forced and print it as Markdown."""
    input_doc = Path("./tests/data/2206.01062.pdf")

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
    # TesseractCliOcrOptions
    # ocr_options = EasyOcrOptions(force_full_page_ocr=True)
    # ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
    ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
    pipeline_options.ocr_options = ocr_options

    converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
                pipeline_options=pipeline_options,
            )
        }
    )

    doc = converter.convert(input_doc).document
    md = doc.export_to_markdown()
    print(md)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -71,6 +71,7 @@ nav:
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- RAG / QA:
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
42 changes: 7 additions & 35 deletions tests/test_e2e_ocr_conversion.py
Original file line number Diff line number Diff line change
@@ -15,34 +15,8 @@

from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

GENERATE = False


# Debug
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    r"""
    Debug helper: write the conversion outputs next to the input PDF.

    Four files are written into ``pdf_path.parent``, all named
    ``<pdf stem><.engine>`` plus one of these suffixes:

    - ``.json``: ``legacy_document.export_to_dict()``
    - ``.pages.json``: ``model_dump()`` of every page in ``doc_result.pages``
    - ``.doctags.txt``: ``legacy_document.export_to_doctags()``
    - ``.md``: ``legacy_document.export_to_markdown()``

    When ``engine`` is ``None`` the ``.engine`` part of the name is omitted.
    """
    import json

    eng = "" if engine is None else f".{engine}"
    # Build names by string formatting rather than ``with_suffix`` so that
    # stems containing dots are kept intact.
    out_dir = pdf_path.parent
    prefix = f"{pdf_path.stem}{eng}"

    with open(out_dir / f"{prefix}.json", "w", encoding="utf-8") as fd:
        json.dump(doc_result.legacy_document.export_to_dict(), fd)

    pages = [p.model_dump() for p in doc_result.pages]
    with open(out_dir / f"{prefix}.pages.json", "w", encoding="utf-8") as fd:
        json.dump(pages, fd)

    with open(out_dir / f"{prefix}.doctags.txt", "w", encoding="utf-8") as fd:
        fd.write(doc_result.legacy_document.export_to_doctags())

    with open(out_dir / f"{prefix}.md", "w", encoding="utf-8") as fd:
        fd.write(doc_result.legacy_document.export_to_markdown())
GENERATE_V1 = False
GENERATE_V2 = False


def get_pdf_paths():
@@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):


def test_e2e_conversions():

pdf_paths = get_pdf_paths()

engines: List[OcrOptions] = [
EasyOcrOptions(),
TesseractOcrOptions(),
TesseractCliOcrOptions(),
EasyOcrOptions(force_full_page_ocr=True),
TesseractOcrOptions(force_full_page_ocr=True),
TesseractCliOcrOptions(force_full_page_ocr=True),
]

for ocr_options in engines:
@@ -91,20 +67,16 @@ def test_e2e_conversions():

doc_result: ConversionResult = converter.convert(pdf_path)

# Save conversions
# save_output(pdf_path, doc_result, None)

# Debug
verify_conversion_result_v1(
input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
generate=GENERATE_V1,
fuzzy=True,
)

verify_conversion_result_v2(
input_path=pdf_path,
doc_result=doc_result,
generate=GENERATE,
generate=GENERATE_V2,
fuzzy=True,
)
8 changes: 8 additions & 0 deletions tests/verify_utils.py
Original file line number Diff line number Diff line change
@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))

md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
fw.write(doc_pred_md)

dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))

json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))

md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
fw.write(doc_pred_md)

dt_path.parent.mkdir(parents=True, exist_ok=True)
with open(dt_path, "w") as fw:
fw.write(doc_pred_dt)
else: # default branch in test

0 comments on commit c6b3763

Please sign in to comment.