Skip to content

Commit

Permalink
More fixes to the font-understanding
Browse files: browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 23, 2025
1 parent 1d2eece commit 9d71602
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 14 deletions.
4 changes: 2 additions & 2 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ class PdfColoredElement(PdfBaseElement):
class PdfCell(PdfColoredElement):

rect: BoundingRectangle

rect_fontbbox: Optional[BoundingRectangle] = None
rect_capheight: Optional[BoundingRectangle] = None

Expand Down Expand Up @@ -366,7 +366,7 @@ def export_to_textlines(
line += f"({cell.rect.r_x1:03.02f}, {cell.rect.r_y1:03.02f}) "
line += f"({cell.rect.r_x2:03.02f}, {cell.rect.r_y2:03.02f}) "
line += f"({cell.rect.r_x3:03.02f}, {cell.rect.r_y3:03.02f}) "

if add_fontkey:
line += f"{cell.font_key} "

Expand Down
35 changes: 23 additions & 12 deletions tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def verify_bitmap_resources(


def verify_cells(
true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float, filename: str
true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float, filename: str
) -> bool:

assert len(true_cells) == len(pred_cells), "len(true_cells)==len(pred_cells)"
Expand All @@ -67,13 +67,16 @@ def verify_cells(
true_cell.ordering == pred_cell.ordering
), "true_cell.ordering == pred_cell.ordering"

assert true_cell.text == pred_cell.text, f"true_cell.text == pred_cell.text => {true_cell.text} == {pred_cell.text} for {filename}"
assert true_cell.orig == pred_cell.orig, f"true_cell.orig == pred_cell.orig => {true_cell.orig} == {pred_cell.orig} for {filename}"

assert (
true_cell.text == pred_cell.text
), f"true_cell.text == pred_cell.text => {true_cell.text} == {pred_cell.text} for {filename}"
assert (
true_cell.orig == pred_cell.orig
), f"true_cell.orig == pred_cell.orig => {true_cell.orig} == {pred_cell.orig} for {filename}"

true_rect = true_cell.rect.to_polygon()
pred_rect = pred_cell.rect.to_polygon()


for l in range(0, 4):
assert (
abs(true_rect[l][0] - pred_rect[l][0]) < eps
Expand All @@ -82,7 +85,7 @@ def verify_cells(
assert (
abs(true_rect[l][1] - pred_rect[l][1]) < eps
), f"abs(true_rect[{l}][1]-pred_rect[{l}][1])<eps -> abs({true_rect[l][1]}-{pred_rect[l][1]})<{eps} for {filename}"

# print("true-text: ", true_cell.text)
# print("pred-text: ", pred_cell.text)

Expand Down Expand Up @@ -162,7 +165,9 @@ def verify_lines(
return True


def verify_SegmentedPdfPage(true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage, filename: str):
def verify_SegmentedPdfPage(
true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage, filename: str
):

eps = max(true_page.dimension.width / 100.0, true_page.dimension.height / 100.0)

Expand All @@ -175,10 +180,12 @@ def verify_SegmentedPdfPage(true_page: SegmentedPdfPage, pred_page: SegmentedPdf
verify_lines(true_page.lines, pred_page.lines, eps=eps)


def verify_ParsedPdfPage(true_page: ParsedPdfPage, pred_page: ParsedPdfPage):
def verify_ParsedPdfPage(
true_page: ParsedPdfPage, pred_page: ParsedPdfPage, filename: str = ""
):

verify_SegmentedPdfPage(true_page.original, pred_page.original)
verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized)
verify_SegmentedPdfPage(true_page.original, pred_page.original, filename=filename)
verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized, filename=filename)


def test_reference_documents_from_filenames():
Expand Down Expand Up @@ -216,7 +223,9 @@ def test_reference_documents_from_filenames():
print(f"loading from {fname}")

true_page = SegmentedPdfPage.load_from_json(fname)
verify_SegmentedPdfPage(true_page, pred_page.original, filename=pdf_doc_path)
verify_SegmentedPdfPage(
true_page, pred_page.original, filename=pdf_doc_path
)

if True:
rname = os.path.basename(pdf_doc_path)
Expand All @@ -230,7 +239,9 @@ def test_reference_documents_from_filenames():
print(f"loading from {fname}")

true_page = SegmentedPdfPage.load_from_json(fname)
verify_SegmentedPdfPage(true_page, pred_page.sanitized, filename=fname)
verify_SegmentedPdfPage(
true_page, pred_page.sanitized, filename=fname
)

pred_page.original.render()
# res.show()
Expand Down

0 comments on commit 9d71602

Please sign in to comment.