From f4bf3d25b955b71729833a18aa3a5b643fecfa75 Mon Sep 17 00:00:00 2001
From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com>
Date: Tue, 30 Jul 2024 14:51:47 +0200
Subject: [PATCH] fix: Correct text extraction for table cells (#21)

* - Fix the scaling transformation applied to table cell bounding boxes when do_cell_matching = False
- Correct examples/convert.py to pass the appropriate parameter, so the example conversion produces good-quality output

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

* Completed checks

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
---
 docling/models/table_structure_model.py | 7 +++++--
 examples/convert.py                     | 8 +++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index 132b141c..09c789d2 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -114,12 +114,15 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
                     for element in table_out["tf_responses"]:
 
                         if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(element["bbox"])
+                            the_bbox = BoundingBox.model_validate(
+                                element["bbox"]
+                            ).scaled(1 / self.scale)
                             text_piece = page._backend.get_text_in_rect(the_bbox)
                             element["bbox"]["token"] = text_piece
 
                         tc = TableCell.model_validate(element)
-                        tc.bbox = tc.bbox.scaled(1 / self.scale)
+                        if self.do_cell_matching:
+                            tc.bbox = tc.bbox.scaled(1 / self.scale)
                         table_cells.append(tc)
 
                     # Retrieving cols/rows, after post processing:
diff --git a/examples/convert.py b/examples/convert.py
index 26a38c51..f197c20a 100644
--- a/examples/convert.py
+++ b/examples/convert.py
@@ -53,7 +53,13 @@ def main():
 
     artifacts_path = DocumentConverter.download_models_hf()
 
-    doc_converter = DocumentConverter(artifacts_path=artifacts_path)
+    pipeline_options = PipelineOptions(do_table_structure=True)
+    # use text cells predicted from table structure model, instead of matching with pdf cells
+    pipeline_options.table_structure_options.do_cell_matching = False
+
+    doc_converter = DocumentConverter(
+        artifacts_path=artifacts_path, pipeline_options=pipeline_options
+    )
 
     input = DocumentConversionInput.from_paths(input_doc_paths)