diff --git a/docling_parse/document.py b/docling_parse/document.py index e441db8f..ff462818 100644 --- a/docling_parse/document.py +++ b/docling_parse/document.py @@ -153,7 +153,7 @@ class PdfColoredElement(PdfBaseElement): class PdfCell(PdfColoredElement): rect: BoundingRectangle - + rect_fontbbox: Optional[BoundingRectangle] = None rect_capheight: Optional[BoundingRectangle] = None @@ -366,7 +366,7 @@ def export_to_textlines( line += f"({cell.rect.r_x1:03.02f}, {cell.rect.r_y1:03.02f}) " line += f"({cell.rect.r_x2:03.02f}, {cell.rect.r_y2:03.02f}) " line += f"({cell.rect.r_x3:03.02f}, {cell.rect.r_y3:03.02f}) " - + if add_fontkey: line += f"{cell.font_key} " diff --git a/tests/test_parse.py b/tests/test_parse.py index 3a1e4fe8..2ff3e67e 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -54,7 +54,7 @@ def verify_bitmap_resources( def verify_cells( - true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float, filename: str + true_cells: List[PdfCell], pred_cells: List[PdfCell], eps: float, filename: str ) -> bool: assert len(true_cells) == len(pred_cells), "len(true_cells)==len(pred_cells)" @@ -67,13 +67,16 @@ def verify_cells( true_cell.ordering == pred_cell.ordering ), "true_cell.ordering == pred_cell.ordering" - assert true_cell.text == pred_cell.text, f"true_cell.text == pred_cell.text => {true_cell.text} == {pred_cell.text} for {filename}" - assert true_cell.orig == pred_cell.orig, f"true_cell.orig == pred_cell.orig => {true_cell.orig} == {pred_cell.orig} for {filename}" - + assert ( + true_cell.text == pred_cell.text + ), f"true_cell.text == pred_cell.text => {true_cell.text} == {pred_cell.text} for {filename}" + assert ( + true_cell.orig == pred_cell.orig + ), f"true_cell.orig == pred_cell.orig => {true_cell.orig} == {pred_cell.orig} for {filename}" + true_rect = true_cell.rect.to_polygon() pred_rect = pred_cell.rect.to_polygon() - for l in range(0, 4): assert ( abs(true_rect[l][0] - pred_rect[l][0]) < eps @@ -82,7 +85,7 @@ def verify_cells( assert ( abs(true_rect[l][1] - pred_rect[l][1]) < eps ), f"abs(true_rect[{l}][1]-pred_rect[{l}][1]) abs({true_rect[l][1]}-{pred_rect[l][1]})<{eps} for {filename}" - + # print("true-text: ", true_cell.text) # print("pred-text: ", pred_cell.text) @@ -162,7 +165,9 @@ def verify_lines( return True -def verify_SegmentedPdfPage(true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage, filename: str): +def verify_SegmentedPdfPage( + true_page: SegmentedPdfPage, pred_page: SegmentedPdfPage, filename: str +): eps = max(true_page.dimension.width / 100.0, true_page.dimension.height / 100.0) @@ -175,10 +180,12 @@ def verify_SegmentedPdfPage(true_page: SegmentedPdfPage, pred_page: SegmentedPdf verify_lines(true_page.lines, pred_page.lines, eps=eps) -def verify_ParsedPdfPage(true_page: ParsedPdfPage, pred_page: ParsedPdfPage): +def verify_ParsedPdfPage( + true_page: ParsedPdfPage, pred_page: ParsedPdfPage, filename: str = "" +): - verify_SegmentedPdfPage(true_page.original, pred_page.original) - verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized) + verify_SegmentedPdfPage(true_page.original, pred_page.original, filename=filename) + verify_SegmentedPdfPage(true_page.sanitized, pred_page.sanitized, filename=filename) def test_reference_documents_from_filenames(): @@ -216,7 +223,9 @@ def test_reference_documents_from_filenames(): print(f"loading from {fname}") true_page = SegmentedPdfPage.load_from_json(fname) - verify_SegmentedPdfPage(true_page, pred_page.original, filename=pdf_doc_path) + verify_SegmentedPdfPage( + true_page, pred_page.original, filename=pdf_doc_path + ) if True: rname = os.path.basename(pdf_doc_path) @@ -230,7 +239,9 @@ def test_reference_documents_from_filenames(): print(f"loading from {fname}") true_page = SegmentedPdfPage.load_from_json(fname) - verify_SegmentedPdfPage(true_page, pred_page.sanitized, filename=fname) + verify_SegmentedPdfPage( + true_page, pred_page.sanitized, filename=fname + ) pred_page.original.render() # res.show()