diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index bf7d808a..74b3ddb0 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -202,6 +202,7 @@ def make_spans(cell): page_dimensions = [ PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width) for p in conv_res.pages + if p.size is not None ] ds_doc: DsDocument = DsDocument( diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 0e02f11e..d535b593 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -41,48 +41,50 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: assert page._backend is not None - - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - im = numpy.array(high_res_image) - result = self.reader.readtext(im) - - del high_res_image - del im - - cells = [ - OcrCell( - id=ix, - text=line[1], - confidence=line[2], - bbox=BoundingBox.from_tuple( - coord=( - (line[0][0][0] / self.scale) + ocr_rect.l, - (line[0][0][1] / self.scale) + ocr_rect.t, - (line[0][2][0] / self.scale) + ocr_rect.l, - (line[0][2][1] / self.scale) + ocr_rect.t, - ), - origin=CoordOrigin.TOPLEFT, - ), + if not page._backend.is_valid(): + yield page + else: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect ) - for ix, line in enumerate(result) - ] - all_ocr_cells.extend(cells) + im = numpy.array(high_res_image) + result = self.reader.readtext(im) + + del high_res_image + del im + + cells = [ + OcrCell( + id=ix, + text=line[1], + confidence=line[2], + bbox=BoundingBox.from_tuple( + 
coord=( + (line[0][0][0] / self.scale) + ocr_rect.l, + (line[0][0][1] / self.scale) + ocr_rect.t, + (line[0][2][0] / self.scale) + ocr_rect.l, + (line[0][2][1] / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + for ix, line in enumerate(result) + ] + all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) - # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) - yield page + yield page diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index de371ee5..009a5b92 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -273,68 +273,72 @@ def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: - assert page.size is not None - - clusters = [] - for ix, pred_item in enumerate( - self.layout_predictor.predict(page.get_image(scale=1.0)) - ): - label = DocItemLabel( - pred_item["label"].lower().replace(" ", "_").replace("-", "_") - ) # Temporary, until docling-ibm-model uses docling-core types - cluster = Cluster( - id=ix, - label=label, - confidence=pred_item["confidence"], - bbox=BoundingBox.model_validate(pred_item), - cells=[], - ) - clusters.append(cluster) - - # Map cells to clusters - # TODO: Remove, postprocess should take care of it anyway. 
- for cell in page.cells: - for cluster in clusters: - if not cell.bbox.area() > 0: - overlap_frac = 0.0 - else: - overlap_frac = ( - cell.bbox.intersection_area_with(cluster.bbox) - / cell.bbox.area() - ) - - if overlap_frac > 0.5: - cluster.cells.append(cell) - - # Pre-sort clusters - # clusters = self.sort_clusters_by_cell_order(clusters) - - # DEBUG code: - def draw_clusters_and_cells(): - image = copy.deepcopy(page.image) - draw = ImageDraw.Draw(image) - for c in clusters: - x0, y0, x1, y1 = c.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline="green") - - cell_color = ( - random.randint(30, 140), - random.randint(30, 140), - random.randint(30, 140), + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + assert page.size is not None + + clusters = [] + for ix, pred_item in enumerate( + self.layout_predictor.predict(page.get_image(scale=1.0)) + ): + label = DocItemLabel( + pred_item["label"].lower().replace(" ", "_").replace("-", "_") + ) # Temporary, until docling-ibm-model uses docling-core types + cluster = Cluster( + id=ix, + label=label, + confidence=pred_item["confidence"], + bbox=BoundingBox.model_validate(pred_item), + cells=[], ) - for tc in c.cells: # [:1]: - x0, y0, x1, y1 = tc.bbox.as_tuple() - draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) - image.show() + clusters.append(cluster) + + # Map cells to clusters + # TODO: Remove, postprocess should take care of it anyway. 
+ for cell in page.cells: + for cluster in clusters: + if not cell.bbox.area() > 0: + overlap_frac = 0.0 + else: + overlap_frac = ( + cell.bbox.intersection_area_with(cluster.bbox) + / cell.bbox.area() + ) + + if overlap_frac > 0.5: + cluster.cells.append(cell) + + # Pre-sort clusters + # clusters = self.sort_clusters_by_cell_order(clusters) + + # DEBUG code: + def draw_clusters_and_cells(): + image = copy.deepcopy(page.image) + draw = ImageDraw.Draw(image) + for c in clusters: + x0, y0, x1, y1 = c.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline="green") + + cell_color = ( + random.randint(30, 140), + random.randint(30, 140), + random.randint(30, 140), + ) + for tc in c.cells: # [:1]: + x0, y0, x1, y1 = tc.bbox.as_tuple() + draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color) + image.show() - # draw_clusters_and_cells() + # draw_clusters_and_cells() - clusters, page.cells = self.postprocess( - clusters, page.cells, page.size.height - ) + clusters, page.cells = self.postprocess( + clusters, page.cells, page.size.height + ) - # draw_clusters_and_cells() + # draw_clusters_and_cells() - page.predictions.layout = LayoutPrediction(clusters=clusters) + page.predictions.layout = LayoutPrediction(clusters=clusters) - yield page + yield page diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 6bd55bf0..caf168cc 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -54,111 +54,119 @@ def sanitize_text(self, lines): def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: assert page._backend is not None - assert page.predictions.layout is not None - # assembles some JSON output page by page. 
- - elements: List[PageElement] = [] - headers: List[PageElement] = [] - body: List[PageElement] = [] - - for cluster in page.predictions.layout.clusters: - # _log.info("Cluster label seen:", cluster.label) - if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - - textlines = [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - text = self.sanitize_text(textlines) - text_el = TextElement( - label=cluster.label, - id=cluster.id, - text=text, - page_no=page.page_no, - cluster=cluster, - ) - elements.append(text_el) - - if cluster.label in LayoutModel.PAGE_HEADER_LABELS: - headers.append(text_el) - else: - body.append(text_el) - elif cluster.label == LayoutModel.TABLE_LABEL: - tbl = None - if page.predictions.tablestructure: - tbl = page.predictions.tablestructure.table_map.get( - cluster.id, None - ) - if ( - not tbl - ): # fallback: add table without structure, if it isn't present - tbl = Table( - label=cluster.label, - id=cluster.id, - text="", - otsl_seq=[], - table_cells=[], - cluster=cluster, - page_no=page.page_no, - ) + if not page._backend.is_valid(): + yield page + else: + assert page.predictions.layout is not None - elements.append(tbl) - body.append(tbl) - elif cluster.label == LayoutModel.FIGURE_LABEL: - fig = None - if page.predictions.figures_classification: - fig = page.predictions.figures_classification.figure_map.get( - cluster.id, None - ) - if ( - not fig - ): # fallback: add figure without classification, if it isn't present - fig = FigureElement( + # assembles some JSON output page by page. 
+ + elements: List[PageElement] = [] + headers: List[PageElement] = [] + body: List[PageElement] = [] + + for cluster in page.predictions.layout.clusters: + # _log.info("Cluster label seen:", cluster.label) + if cluster.label in LayoutModel.TEXT_ELEM_LABELS: + + textlines = [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + text = self.sanitize_text(textlines) + text_el = TextElement( label=cluster.label, id=cluster.id, - text="", - data=None, - cluster=cluster, + text=text, page_no=page.page_no, + cluster=cluster, ) - elements.append(fig) - body.append(fig) - elif cluster.label == LayoutModel.FORMULA_LABEL: - equation = None - if page.predictions.equations_prediction: - equation = ( - page.predictions.equations_prediction.equation_map.get( + elements.append(text_el) + + if cluster.label in LayoutModel.PAGE_HEADER_LABELS: + headers.append(text_el) + else: + body.append(text_el) + elif cluster.label == LayoutModel.TABLE_LABEL: + tbl = None + if page.predictions.tablestructure: + tbl = page.predictions.tablestructure.table_map.get( cluster.id, None ) - ) - if not equation: # fallback: add empty formula, if it isn't present - text = self.sanitize_text( - [ - cell.text.replace("\x02", "-").strip() - for cell in cluster.cells - if len(cell.text.strip()) > 0 - ] - ) - equation = TextElement( - label=cluster.label, - id=cluster.id, - cluster=cluster, - page_no=page.page_no, - text=text, - ) - elements.append(equation) - body.append(equation) + if ( + not tbl + ): # fallback: add table without structure, if it isn't present + tbl = Table( + label=cluster.label, + id=cluster.id, + text="", + otsl_seq=[], + table_cells=[], + cluster=cluster, + page_no=page.page_no, + ) + + elements.append(tbl) + body.append(tbl) + elif cluster.label == LayoutModel.FIGURE_LABEL: + fig = None + if page.predictions.figures_classification: + fig = ( + page.predictions.figures_classification.figure_map.get( + cluster.id, None + ) + ) + if 
( + not fig + ): # fallback: add figure without classification, if it isn't present + fig = FigureElement( + label=cluster.label, + id=cluster.id, + text="", + data=None, + cluster=cluster, + page_no=page.page_no, + ) + elements.append(fig) + body.append(fig) + elif cluster.label == LayoutModel.FORMULA_LABEL: + equation = None + if page.predictions.equations_prediction: + equation = ( + page.predictions.equations_prediction.equation_map.get( + cluster.id, None + ) + ) + if ( + not equation + ): # fallback: add empty formula, if it isn't present + text = self.sanitize_text( + [ + cell.text.replace("\x02", "-").strip() + for cell in cluster.cells + if len(cell.text.strip()) > 0 + ] + ) + equation = TextElement( + label=cluster.label, + id=cluster.id, + cluster=cluster, + page_no=page.page_no, + text=text, + ) + elements.append(equation) + body.append(equation) - page.assembled = AssembledUnit( - elements=elements, headers=headers, body=body - ) + page.assembled = AssembledUnit( + elements=elements, headers=headers, body=body + ) - # Remove page images (can be disabled) - if not self.options.keep_images: - page._image_cache = {} + # Remove page images (can be disabled) + if not self.options.keep_images: + page._image_cache = {} - # Unload backend - page._backend.unload() + # Unload backend + page._backend.unload() - yield page + yield page diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 7c71fd50..1e0032c1 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -17,9 +17,13 @@ def __init__(self, options: PagePreprocessingOptions): def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: - page = self._populate_page_images(page) - page = self._parse_page_cells(page) - yield page + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + page = self._populate_page_images(page) + page = 
self._parse_page_cells(page) + yield page # Generate the page image and store it in the page object def _populate_page_images(self, page: Page) -> Page: diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 833348e9..a3257ab3 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -71,92 +71,101 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: assert page._backend is not None - assert page.predictions.layout is not None - assert page.size is not None - - page.predictions.tablestructure = TableStructurePrediction() # dummy - - in_tables = [ - ( - cluster, - [ - round(cluster.bbox.l) * self.scale, - round(cluster.bbox.t) * self.scale, - round(cluster.bbox.r) * self.scale, - round(cluster.bbox.b) * self.scale, - ], - ) - for cluster in page.predictions.layout.clusters - if cluster.label == DocItemLabel.TABLE - ] - if not len(in_tables): + if not page._backend.is_valid(): yield page - continue - - tokens = [] - for c in page.cells: - for cluster, _ in in_tables: - if c.bbox.area() > 0: - if ( - c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area() - > 0.2 - ): - # Only allow non empty stings (spaces) into the cells of a table - if len(c.text.strip()) > 0: - new_cell = copy.deepcopy(c) - new_cell.bbox = new_cell.bbox.scaled(scale=self.scale) - - tokens.append(new_cell.model_dump()) - - page_input = { - "tokens": tokens, - "width": page.size.width * self.scale, - "height": page.size.height * self.scale, - } - page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) - - table_clusters, table_bboxes = zip(*in_tables) - - if len(table_bboxes): - tf_output = self.tf_predictor.multi_table_predict( - page_input, table_bboxes, do_matching=self.do_cell_matching - ) - - for table_cluster, table_out in zip(table_clusters, tf_output): - table_cells = [] - for element in table_out["tf_responses"]: - - if not 
self.do_cell_matching: - the_bbox = BoundingBox.model_validate( - element["bbox"] - ).scaled(1 / self.scale) - text_piece = page._backend.get_text_in_rect(the_bbox) - element["bbox"]["token"] = text_piece - - tc = TableCell.model_validate(element) - if self.do_cell_matching and tc.bbox is not None: - tc.bbox = tc.bbox.scaled(1 / self.scale) - table_cells.append(tc) - - # Retrieving cols/rows, after post processing: - num_rows = table_out["predict_details"]["num_rows"] - num_cols = table_out["predict_details"]["num_cols"] - otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"] - - tbl = Table( - otsl_seq=otsl_seq, - table_cells=table_cells, - num_rows=num_rows, - num_cols=num_cols, - id=table_cluster.id, - page_no=page.page_no, - cluster=table_cluster, - label=DocItemLabel.TABLE, + else: + + assert page.predictions.layout is not None + assert page.size is not None + + page.predictions.tablestructure = TableStructurePrediction() # dummy + + in_tables = [ + ( + cluster, + [ + round(cluster.bbox.l) * self.scale, + round(cluster.bbox.t) * self.scale, + round(cluster.bbox.r) * self.scale, + round(cluster.bbox.b) * self.scale, + ], + ) + for cluster in page.predictions.layout.clusters + if cluster.label == DocItemLabel.TABLE + ] + if not len(in_tables): + yield page + continue + + tokens = [] + for c in page.cells: + for cluster, _ in in_tables: + if c.bbox.area() > 0: + if ( + c.bbox.intersection_area_with(cluster.bbox) + / c.bbox.area() + > 0.2 + ): + # Only allow non empty strings (spaces) into the cells of a table + if len(c.text.strip()) > 0: + new_cell = copy.deepcopy(c) + new_cell.bbox = new_cell.bbox.scaled( + scale=self.scale + ) + + tokens.append(new_cell.model_dump()) + + page_input = { + "tokens": tokens, + "width": page.size.width * self.scale, + "height": page.size.height * self.scale, + } + page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) + + table_clusters, table_bboxes = zip(*in_tables) + + if len(table_bboxes): + tf_output = 
self.tf_predictor.multi_table_predict( + page_input, table_bboxes, do_matching=self.do_cell_matching ) - page.predictions.tablestructure.table_map[table_cluster.id] = tbl - - # For debugging purposes: - # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) + for table_cluster, table_out in zip(table_clusters, tf_output): + table_cells = [] + for element in table_out["tf_responses"]: + + if not self.do_cell_matching: + the_bbox = BoundingBox.model_validate( + element["bbox"] + ).scaled(1 / self.scale) + text_piece = page._backend.get_text_in_rect(the_bbox) + element["bbox"]["token"] = text_piece + + tc = TableCell.model_validate(element) + if self.do_cell_matching and tc.bbox is not None: + tc.bbox = tc.bbox.scaled(1 / self.scale) + table_cells.append(tc) + + # Retrieving cols/rows, after post processing: + num_rows = table_out["predict_details"]["num_rows"] + num_cols = table_out["predict_details"]["num_cols"] + otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"] + + tbl = Table( + otsl_seq=otsl_seq, + table_cells=table_cells, + num_rows=num_rows, + num_cols=num_cols, + id=table_cluster.id, + page_no=page.page_no, + cluster=table_cluster, + label=DocItemLabel.TABLE, + ) + + page.predictions.tablestructure.table_map[table_cluster.id] = ( + tbl + ) + + # For debugging purposes: + # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) - yield page + yield page diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 2c416d97..b042653b 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -110,61 +110,65 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes 
+ if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) - ocr_rects = self.get_ocr_rects(page) - - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) - - with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: - fname = image_file.name - high_res_image.save(fname) - - df = self._run_tesseract(fname) - - # _log.info(df) - - # Print relevant columns (bounding box and text) - for ix, row in df.iterrows(): - text = row["text"] - conf = row["conf"] - - l = float(row["left"]) - b = float(row["top"]) - w = float(row["width"]) - h = float(row["height"]) - - t = b + h - r = l + w - - cell = OcrCell( - id=ix, - text=text, - confidence=conf / 100.0, - bbox=BoundingBox.from_tuple( - coord=( - (l / self.scale) + ocr_rect.l, - (b / self.scale) + ocr_rect.t, - (r / self.scale) + ocr_rect.l, - (t / self.scale) + ocr_rect.t, + with tempfile.NamedTemporaryFile( + suffix=".png", mode="w" + ) as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, ), - origin=CoordOrigin.TOPLEFT, - ), - ) - all_ocr_cells.append(cell) + ) + all_ocr_cells.append(cell) - ## Remove OCR cells which overlap with programmatic cells. 
- filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) - # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) - yield page + yield page diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index a97eb9a8..f8a1fe57 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -69,57 +69,62 @@ def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: for page in page_batch: assert page._backend is not None - assert self.reader is not None + if not page._backend.is_valid(): + yield page + else: + assert self.reader is not None - ocr_rects = self.get_ocr_rects(page) + ocr_rects = self.get_ocr_rects(page) - all_ocr_cells = [] - for ocr_rect in ocr_rects: - # Skip zero area boxes - if ocr_rect.area() == 0: - continue - high_res_image = page._backend.get_page_image( - scale=self.scale, cropbox=ocr_rect - ) + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) - # Retrieve text snippets with their bounding boxes - self.reader.SetImage(high_res_image) - boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True) - - cells = [] - for ix, (im, box, _, _) in enumerate(boxes): - # Set the area of interest. 
Tesseract uses Bottom-Left for the origin - self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) - - # Extract text within the bounding box - text = self.reader.GetUTF8Text().strip() - confidence = self.reader.MeanTextConf() - left = box["x"] / self.scale - bottom = box["y"] / self.scale - right = (box["x"] + box["w"]) / self.scale - top = (box["y"] + box["h"]) / self.scale - - cells.append( - OcrCell( - id=ix, - text=text, - confidence=confidence, - bbox=BoundingBox.from_tuple( - coord=(left, top, right, bottom), - origin=CoordOrigin.TOPLEFT, - ), - ) + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages( + self.reader_RIL.TEXTLINE, True ) - # del high_res_image - all_ocr_cells.extend(cells) + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) + + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() + left = box["x"] / self.scale + bottom = box["y"] / self.scale + right = (box["x"] + box["w"]) / self.scale + top = (box["y"] + box["h"]) / self.scale + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) - ## Remove OCR cells which overlap with programmatic cells. - filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + ## Remove OCR cells which overlap with programmatic cells. 
+ filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) - page.cells.extend(filtered_ocr_cells) + page.cells.extend(filtered_ocr_cells) - # DEBUG code: - # self.draw_ocr_rects_and_cells(page, ocr_rects) + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) - yield page + yield page diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 655caea1..5de2e32f 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -134,13 +134,13 @@ def _assemble_document( all_body = [] for p in conv_res.pages: - assert p.assembled is not None - for el in p.assembled.body: - all_body.append(el) - for el in p.assembled.headers: - all_headers.append(el) - for el in p.assembled.elements: - all_elements.append(el) + if p.assembled is not None: + for el in p.assembled.body: + all_body.append(el) + for el in p.assembled.headers: + all_headers.append(el) + for el in p.assembled.elements: + all_elements.append(el) conv_res.assembled = AssembledUnit( elements=all_elements, headers=all_headers, body=all_body diff --git a/docs/v2.md b/docs/v2.md index 319679d6..1c6ee6d9 100644 --- a/docs/v2.md +++ b/docs/v2.md @@ -126,7 +126,7 @@ input_files = [ ] # Directly pass list of files or streams to `convert_all` -conv_results_iter = doc_converter.convert_all(input_files) # previously `convert_batch` +conv_results_iter = doc_converter.convert_all(input_files) # previously `convert` ``` Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first @@ -135,7 +135,7 @@ By default, any error is immediately raised and the conversion aborts (previousl ```python ... -conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert_batch` +conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert` ```