class BBoxDirection(Enum):
    """Dominant text direction of the cells found inside a bounding box."""

    Bottom2Top = "Bottom2Top"
    Right2Left = "Right2Left"
    Top2Bottom = "Top2Bottom"
    Left2Right = "Left2Right"

    # NOTE: deliberately a plain function (no ``self``, no ``@staticmethod``).
    # Its single positional parameter doubles as the bound instance, so both
    # ``BBoxDirection.rotation_to_bottom2top(d)`` and
    # ``d.rotation_to_bottom2top()`` work.  Functions defined in an Enum body
    # are methods, not enum members.
    def rotation_to_bottom2top(direction: "BBoxDirection") -> Optional[float]:
        """Return the angle, in degrees, associated with *direction*.

        Bottom2Top maps to 0.0, Right2Left to 90.0, Top2Bottom to 180.0 and
        Left2Right to -90.0.  Presumably this is the rotation that brings the
        content back to the Bottom2Top orientation; the sign convention is
        not visible here -- confirm against the caller.

        Returns ``None`` when *direction* is not a ``BBoxDirection`` member.
        """
        if direction == BBoxDirection.Bottom2Top:
            return 0.0
        if direction == BBoxDirection.Right2Left:
            return 90.0
        if direction == BBoxDirection.Top2Bottom:
            return 180.0
        if direction == BBoxDirection.Left2Right:
            return -90.0
        return None


def get_orientation_bbox_v2(
    data: List[Tuple], header: list[str], bbox: Tuple[float, float, float, float]
) -> BBoxDirection:
    """Return the dominant text direction of the cells lying inside *bbox*.

    Every row of *data* whose axis-aligned box (``x0, y0, x1, y1``) is fully
    contained in *bbox* (bounds inclusive) votes for one direction.  The vote
    is decided by comparing corner 0 (``r_x0, r_y0``) with corner 2
    (``r_x2, r_y2``) of the row's rotated rectangle, and is weighted by the
    length of the row's ``text``.

    Args:
        data: Rows of cell values, indexed according to *header*.
        header: Column names; must contain ``x0, y0, x1, y1``, ``r_x0`` ..
            ``r_y3`` and ``text``.
        bbox: ``(x0, y0, x1, y1)`` region to inspect.

    Returns:
        The direction with the highest accumulated text length.  Ties go to
        the direction declared first in ``BBoxDirection``;
        ``BBoxDirection.Bottom2Top`` is returned when nothing was counted.

    Raises:
        ValueError: If one of the required columns is missing from *header*.
    """
    # Resolve all expected columns up front (same order as before) so that a
    # malformed header fails immediately with ValueError, even for columns
    # the heuristic below does not read (r_x1/r_y1/r_x3/r_y3).
    required = (
        "x0",
        "y0",
        "x1",
        "y1",
        "r_x0",
        "r_y0",
        "r_x1",
        "r_y1",
        "r_x2",
        "r_y2",
        "r_x3",
        "r_y3",
        "text",
    )
    col = {name: header.index(name) for name in required}

    x0, y0, x1, y1 = col["x0"], col["y0"], col["x1"], col["y1"]
    r_x0, r_y0 = col["r_x0"], col["r_y0"]
    r_x2, r_y2 = col["r_x2"], col["r_y2"]
    ti = col["text"]

    hist: Dict[BBoxDirection, int] = {direction: 0 for direction in BBoxDirection}

    for row in data:
        inside = (
            bbox[0] <= row[x0]
            and row[x1] <= bbox[2]
            and bbox[1] <= row[y0]
            and row[y1] <= bbox[3]
        )
        if not inside:
            continue

        x_grows = row[r_x0] < row[r_x2]
        x_shrinks = row[r_x2] < row[r_x0]
        y_grows = row[r_y0] < row[r_y2]
        y_shrinks = row[r_y2] < row[r_y0]

        # One direction per quadrant of the corner0 -> corner2 diagonal.
        # Degenerate rows (equal x or equal y) do not vote.
        if x_grows and y_grows:
            voted = BBoxDirection.Bottom2Top
        elif x_shrinks and y_grows:
            voted = BBoxDirection.Right2Left
        elif x_shrinks and y_shrinks:
            voted = BBoxDirection.Top2Bottom
        elif x_grows and y_shrinks:
            # Previously counted as Right2Left, which made Left2Right
            # impossible to detect.
            voted = BBoxDirection.Left2Right
        else:
            continue

        hist[voted] += len(row[ti])

    max_dir = BBoxDirection.Bottom2Top
    max_val = 0
    for key, val in hist.items():
        logging.info(f"{key}: {val}")
        # Strict comparison keeps the first-declared direction on ties.
        if val > max_val:
            max_val = val
            max_dir = key

    return max_dir