From f4bf3d25b955b71729833a18aa3a5b643fecfa75 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Tue, 30 Jul 2024 14:51:47 +0200 Subject: [PATCH] fix: Correct text extraction for table cells (#21) * - Fixes for scaling transformation for table cell bounding boxes when using do_cell_matching = False - Corrected examples/convert.py with appropriate parameter, for good quality example conversion Signed-off-by: Maxim Lysak * Completed checks Signed-off-by: Maxim Lysak --------- Signed-off-by: Maxim Lysak Co-authored-by: Maxim Lysak --- docling/models/table_structure_model.py | 7 +++++-- examples/convert.py | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 132b141c..09c789d2 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -114,12 +114,15 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for element in table_out["tf_responses"]: if not self.do_cell_matching: - the_bbox = BoundingBox.model_validate(element["bbox"]) + the_bbox = BoundingBox.model_validate( + element["bbox"] + ).scaled(1 / self.scale) text_piece = page._backend.get_text_in_rect(the_bbox) element["bbox"]["token"] = text_piece tc = TableCell.model_validate(element) - tc.bbox = tc.bbox.scaled(1 / self.scale) + if self.do_cell_matching: + tc.bbox = tc.bbox.scaled(1 / self.scale) table_cells.append(tc) # Retrieving cols/rows, after post processing: diff --git a/examples/convert.py b/examples/convert.py index 26a38c51..f197c20a 100644 --- a/examples/convert.py +++ b/examples/convert.py @@ -53,7 +53,13 @@ def main(): artifacts_path = DocumentConverter.download_models_hf() - doc_converter = DocumentConverter(artifacts_path=artifacts_path) + pipeline_options = PipelineOptions(do_table_structure=True) + # use text cells predicted from table structure model, instead of matching with pdf cells + pipeline_options.table_structure_options.do_cell_matching = False + + doc_converter = DocumentConverter( + artifacts_path=artifacts_path, pipeline_options=pipeline_options + ) input = DocumentConversionInput.from_paths(input_doc_paths)