From 654185dadb35998d358fe17896d71c5ee3fc581b Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 13 Jan 2025 17:59:27 +0100 Subject: [PATCH] reformatted the pdf_parser Signed-off-by: Peter Staar --- docling_parse/document.py | 236 +++++------------------------------- docling_parse/pdf_parser.py | 147 +++++++++++----------- tests/test_parse.py | 11 +- tests/test_parse_v2.py | 74 ----------- 4 files changed, 106 insertions(+), 362 deletions(-) diff --git a/docling_parse/document.py b/docling_parse/document.py index 2155e7f2..f952d41a 100644 --- a/docling_parse/document.py +++ b/docling_parse/document.py @@ -2,7 +2,7 @@ import logging from enum import Enum -from typing import Dict, List, Optional, Tuple +from typing import Dict, Generator, List, Optional, Tuple from docling_core.types.doc.base import BoundingBox, CoordOrigin from PIL import Image as PILImage @@ -112,7 +112,8 @@ class PageCell(BaseModel): widget: bool # FIXME: could use something more sofisticated? - rgba: Tuple[int, int, int, int] = [0, 0, 0, 255] + rgba: Tuple[int, int, int, int] = (0, 0, 0, 255) + class PageImage(BaseModel): @@ -122,9 +123,9 @@ class PageImage(BaseModel): class PageLine(BaseModel): - #i: List[int] - #x: List[float] - #y: List[float] + # i: List[int] + # x: List[float] + # y: List[float] line_parent_id: int points: List[Tuple[float, float]] @@ -132,18 +133,20 @@ class PageLine(BaseModel): coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT # FIXME: could use something more sofisticated? - rgba: Tuple[int, int, int, int] = [0, 0, 0, 255] + rgba: Tuple[int, int, int, int] = (0, 0, 0, 255) width: float = 1.0 def __len__(self) -> int: return len(self.points) - - def iterate_segments(self) -> Tuple[Tuple[float, float], Tuple[float, float]]: - for k in range(0, len(self.points)-1): - yield (self.points[k], self.points[k+1]) + def iterate_segments( + self, + ) -> Generator[Tuple[Tuple[float, float], Tuple[float, float]]]: - def to_bottom_left_origin(self, page_height: float) -> "BoundingRectangle": + for k in range(0, len(self.points) - 1): + yield (self.points[k], self.points[k + 1]) + + def to_bottom_left_origin(self, page_height: float): """to_bottom_left_origin. :param page_height: @@ -153,11 +156,11 @@ def to_bottom_left_origin(self, page_height: float) -> "BoundingRectangle": return self elif self.coord_origin == CoordOrigin.TOPLEFT: for i, point in enumerate(self.points): - self.points[i] = (point[0], page_height-point[1]) + self.points[i] = (point[0], page_height - point[1]) - self.coord_origin = CoordOrigin.BOTTOMLEFT + self.coord_origin = CoordOrigin.BOTTOMLEFT - def to_top_left_origin(self, page_height: float) -> "BoundingRectangle": + def to_top_left_origin(self, page_height: float): """to_top_left_origin. :param page_height: @@ -167,10 +170,11 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle": return self elif self.coord_origin == CoordOrigin.BOTTOMLEFT: for i, point in enumerate(self.points): - self.points[i] = (point[0], page_height-point[1]) + self.points[i] = (point[0], page_height - point[1]) self.coord_origin = CoordOrigin.TOPLEFT - + + class PageBoundaryLabel(str, Enum): """PageBoundaryLabel.""" @@ -233,8 +237,8 @@ def render( page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP, # media_box draw_cells_bbox: bool = True, draw_cells_text: bool = False, - draw_cells_bl: bool = False, - draw_cells_tr: bool = False, + draw_cells_bl: bool = True, + draw_cells_tr: bool = True, cell_outline: str = "black", cell_color: str = "blue", cell_alpha: float = 1.0, @@ -317,8 +321,8 @@ def _get_rgba(name: str, alpha: float): logging.warning("implement draw_cells_text") if draw_cells_bl: - fill = (_get_rgba(name=cell_bl_color, alpha=cell_bl_alpha),) - outline = (_get_rgba(name=cell_bl_outline, alpha=cell_bl_alpha),) + fill = _get_rgba(name=cell_bl_color, alpha=cell_bl_alpha) + outline = _get_rgba(name=cell_bl_outline, alpha=cell_bl_alpha) # Define the bounding box for the dot dot_bbox = [ @@ -330,8 +334,8 @@ def _get_rgba(name: str, alpha: float): draw.ellipse(dot_bbox, fill=fill, outline=outline) if draw_cells_tr: - fill = (_get_rgba(name=cell_tr_color, alpha=cell_tr_alpha),) - outline = (_get_rgba(name=cell_tr_outline, alpha=cell_tr_alpha),) + fill = _get_rgba(name=cell_tr_color, alpha=cell_tr_alpha) + outline = _get_rgba(name=cell_tr_outline, alpha=cell_tr_alpha) # Define the bounding box for the dot dot_bbox = [ @@ -348,14 +352,14 @@ def _get_rgba(name: str, alpha: float): # Draw each rectangle by connecting its four points for line in self.lines: - line.to_top_left_origin(page_height=H) - for segment in line.iterate_segments(): + line.to_top_left_origin(page_height=H) + for segment in line.iterate_segments(): draw.line( (segment[0][0], segment[0][1], segment[1][0], segment[1][1]), fill=fill, width=line_width, - ) - + ) + return result @@ -379,183 +383,3 @@ class ParsedPage(BaseModel): class ParsedPaginatedDocument(BaseModel): pages: Dict[int, ParsedPage] = {} - - -def _to_dimension(dimension: dict) -> PageDimension: - - page_boundary: PageBoundaryLabel = PageBoundaryLabel(dimension["page_boundary"]) - - bbox = BoundingBox( - l=dimension["bbox"][0], - b=dimension["bbox"][1], - r=dimension["bbox"][2], - t=dimension["bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - rect = BoundingRectangle( - r_x0=bbox.l, - r_y0=bbox.b, - r_x1=bbox.r, - r_y1=bbox.b, - r_x2=bbox.r, - r_y2=bbox.t, - r_x3=bbox.l, - r_y3=bbox.t, - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - art_bbox = BoundingBox( - l=dimension["rectangles"]["art-bbox"][0], - b=dimension["rectangles"]["art-bbox"][1], - r=dimension["rectangles"]["art-bbox"][2], - t=dimension["rectangles"]["art-bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - media_bbox = BoundingBox( - l=dimension["rectangles"]["media-bbox"][0], - b=dimension["rectangles"]["media-bbox"][1], - r=dimension["rectangles"]["media-bbox"][2], - t=dimension["rectangles"]["media-bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - bleed_bbox = BoundingBox( - l=dimension["rectangles"]["bleed-bbox"][0], - b=dimension["rectangles"]["bleed-bbox"][1], - r=dimension["rectangles"]["bleed-bbox"][2], - t=dimension["rectangles"]["bleed-bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - trim_bbox = BoundingBox( - l=dimension["rectangles"]["trim-bbox"][0], - b=dimension["rectangles"]["trim-bbox"][1], - r=dimension["rectangles"]["trim-bbox"][2], - t=dimension["rectangles"]["trim-bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - crop_bbox = BoundingBox( - l=dimension["rectangles"]["crop-bbox"][0], - b=dimension["rectangles"]["crop-bbox"][1], - r=dimension["rectangles"]["crop-bbox"][2], - t=dimension["rectangles"]["crop-bbox"][3], - coord_origin=CoordOrigin.BOTTOMLEFT, - ) - - return PageDimension( - angle=dimension["angle"], - page_boundary=dimension["page_boundary"], - bbox=bbox, - rect=rect, - art_bbox=art_bbox, - media_bbox=media_bbox, - trim_bbox=trim_bbox, - crop_bbox=crop_bbox, - bleed_bbox=bleed_bbox, - ) - - -def _to_cells(cells: dict) -> List[PageCell]: - - assert "data" in cells, '"data" in cells' - assert "header" in cells, '"header" in cells' - - data = cells["data"] - header = cells["header"] - - result: List[PageCell] = [] - for ind, row in enumerate(data): - rect = BoundingRectangle( - r_x0=row[header.index(f"r_x0")], - r_y0=row[header.index(f"r_y0")], - r_x1=row[header.index(f"r_x1")], - r_y1=row[header.index(f"r_y1")], - r_x2=row[header.index(f"r_x2")], - r_y2=row[header.index(f"r_y2")], - r_x3=row[header.index(f"r_x3")], - r_y3=row[header.index(f"r_y3")], - ) - cell = PageCell( - rect=rect, - text=row[header.index(f"text")], - orig=row[header.index(f"text")], - font_key=row[header.index(f"font-key")], - font_name=row[header.index(f"font-name")], - widget=row[header.index(f"widget")], - ordering=ind, - rendering_mode="", - ) - result.append(cell) - - return result - - -def _to_images(images: dict) -> List[PageImage]: - - assert "data" in images, '"data" in images' - assert "header" in images, '"header" in images' - - data = images["data"] - header = images["header"] - - result: List[PageImage] = [] - for ind, row in enumerate(data): - rect = BoundingRectangle( - r_x0=row[header.index(f"x0")], - r_y0=row[header.index(f"y0")], - r_x1=row[header.index(f"x1")], - r_y1=row[header.index(f"y0")], - r_x2=row[header.index(f"x1")], - r_y2=row[header.index(f"y1")], - r_x3=row[header.index(f"x0")], - r_y3=row[header.index(f"y1")], - ) - image = PageImage(rect=rect, uri=None) - result.append(image) - - return result - - -def _to_lines(data: dict) -> List[PageLine]: - - result: List[PageLine] = [] - for ind, item in enumerate(data): - - line = PageLine(i=item["i"], x=item["x"], y=item["y"]) - - return result - - -def _to_segmented_page(page: dict) -> SegmentedPage: - - return SegmentedPage( - dimension=_to_dimension(page["dimension"]), - cells=_to_cells(page["cells"]), - images=_to_images(page["images"]), - lines=_to_lines(page["lines"]), - ) - - -def _to_parsed_page(page: dict) -> ParsedPage: - - paginated_page = ParsedPage( - original=_to_segmented_page(page["original"]), - sanitized=_to_segmented_page(page["sanitized"]), - ) - - return paginated_page - - -def from_pdf_parser_v2_to_parsed_paginated_document( - doc_dict: dict, -) -> ParsedPaginatedDocument: - - parsed_doc = ParsedPaginatedDocument() - - for pi, page in enumerate(doc_dict["pages"]): - parsed_doc.pages[pi + 1] = _to_parsed_page(page) - - return parsed_doc diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index a6d7747f..44e4b42e 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -1,43 +1,27 @@ """Parser for PDF files""" -import io -import json -import os - from io import BytesIO - -import logging -from enum import Enum -from typing import Dict, List, Optional, Tuple +from typing import List, Tuple from docling_core.types.doc.base import BoundingBox, CoordOrigin -from PIL import Image as PILImage -from PIL import ImageColor, ImageDraw -from pydantic import AnyUrl, BaseModel - -from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] from docling_parse.document import ( BoundingRectangle, - + PageBoundaryLabel, PageCell, + PageDimension, PageImage, PageLine, - - PageBoundaryLabel, - - PageDimension, - SegmentedPage, - ParsedPageLabel, - ParsedPage, - ParsedPaginatedDocument, + SegmentedPage, ) +from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] + class pdf_parser: - def __init__(self, loglevel:str = "fatal"): + def __init__(self, loglevel: str = "fatal"): """ Set the log level using a string label. @@ -47,7 +31,7 @@ def __init__(self, loglevel:str = "fatal"): """ self.parser = pdf_parser_v2(level=loglevel) - def set_loglevel_with_label(self, loglevel:str): + def set_loglevel_with_label(self, loglevel: str): """Set the log level using a string label. Parameters: @@ -56,8 +40,8 @@ def set_loglevel_with_label(self, loglevel:str): )") """ self.parser.set_loglevel_with_label(level=loglevel) - - def is_loaded(self, key:str) -> bool: + + def is_loaded(self, key: str) -> bool: """Check if a document with the given key is loaded. Parameters: @@ -67,7 +51,7 @@ def is_loaded(self, key:str) -> bool: bool: True if the document is loaded, False otherwise.)") """ return self.parser.is_loaded(key=key) - + def list_loaded_keys(self) -> List[str]: """List the keys of the loaded documents. @@ -75,9 +59,8 @@ def list_loaded_keys(self) -> List[str]: List[str]: A list of keys for the currently loaded documents.)") """ return self.parser.list_loaded_keys() - - - def load_document(self, key:str, filename:str) -> bool: + + def load_document(self, key: str, filename: str) -> bool: """Load a document by key and filename. Parameters: @@ -88,8 +71,8 @@ def load_document(self, key:str, filename:str) -> bool: bool: True if the document was successfully loaded, False otherwise.)") """ return self.parser.load_document(key=key, filename=filename) - - def load_document_from_bytesio(self, key:str, data:BytesIO) -> bool: + + def load_document_from_bytesio(self, key: str, data: BytesIO) -> bool: """Load a document by key from a BytesIO-like object. Parameters: @@ -100,8 +83,8 @@ def load_document_from_bytesio(self, key:str, data:BytesIO) -> bool: bool: True if the document was successfully loaded, False otherwise.)") """ return self.parser.load_document(key=key, bytes_io=data) - - def unload_document(self, key:str) -> bool: + + def unload_document(self, key: str) -> bool: """Unload a document by its unique key. Parameters: @@ -111,8 +94,8 @@ def unload_document(self, key:str) -> bool: bool: True if the document was successfully unloaded, False otherwise.)") """ return self.parser.unload_document(key) - - def number_of_pages(self, key:str) -> int: + + def number_of_pages(self, key: str) -> int: """Get the number of pages in the document identified by its unique key. Parameters: @@ -122,29 +105,42 @@ def number_of_pages(self, key:str) -> int: int: The number of pages in the document.)") """ return self.number_of_pages(key=key) - - def parse(self, key:str, page_no:int=-1, page_boundary: PageBoundaryLabel=PageBoundaryLabel.CROP) -> ParsedPaginatedDocument: + + def parse( + self, + key: str, + page_no: int = -1, + page_boundary: PageBoundaryLabel = PageBoundaryLabel.CROP, + ) -> ParsedPaginatedDocument: """ - Parse the PDF document identified by its unique key and return a JSON representation. + Parse the PDF document identified by its unique key and return a JSON representation. - Parameters: - key (str): The unique key of the document. - page_boundary (str): The page boundary specification for parsing. One of [`crop_box`, `media_box`]. + Parameters: + key (str): The unique key of the document. + page_boundary (str): The page boundary specification for parsing. One of [`crop_box`, `media_box`]. - Returns: - dict: A JSON representation of the parsed PDF document.)") + Returns: + dict: A JSON representation of the parsed PDF document.)") """ - if page_no==-1: - doc_dict = self.parser.parse_pdf_from_key(key=key, page_boundary=page_boundary.value) - return self._to_parsed_paginated_document(doc_dict = doc_dict) - - elif page_no>=1 and page_no<=self.get_number_of_pages(key): - doc_dict = self.parser.parse_pdf_from_key_on_page(key=key, page=page_no-1, page_boundary=page_boundary) - return self._to_parsed_paginated_document(doc_dict = doc_dict, page_no=page_no) + if page_no == -1: + doc_dict = self.parser.parse_pdf_from_key( + key=key, page_boundary=page_boundary.value + ) + return self._to_parsed_paginated_document(doc_dict=doc_dict) + + elif page_no >= 1 and page_no <= self.number_of_pages(key): + doc_dict = self.parser.parse_pdf_from_key_on_page( + key=key, page=page_no - 1, page_boundary=page_boundary + ) + return self._to_parsed_paginated_document( + doc_dict=doc_dict, page_no=page_no + ) else: - raise ValueError(f"incorrect page_no: {page_no} for key={key} (min:1, max:{self.get_number_of_pages(key)})") - + raise ValueError( + f"incorrect page_no: {page_no} for key={key} (min:1, max:{self.number_of_pages(key)})" + ) + def _to_dimension(self, dimension: dict) -> PageDimension: page_boundary: PageBoundaryLabel = PageBoundaryLabel(dimension["page_boundary"]) @@ -156,7 +152,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + rect = BoundingRectangle( r_x0=bbox.l, r_y0=bbox.b, @@ -168,7 +164,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: r_y3=bbox.t, coord_origin=CoordOrigin.BOTTOMLEFT, ) - + art_bbox = BoundingBox( l=dimension["rectangles"]["art-bbox"][0], b=dimension["rectangles"]["art-bbox"][1], @@ -176,7 +172,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["rectangles"]["art-bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + media_bbox = BoundingBox( l=dimension["rectangles"]["media-bbox"][0], b=dimension["rectangles"]["media-bbox"][1], @@ -184,7 +180,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["rectangles"]["media-bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + bleed_bbox = BoundingBox( l=dimension["rectangles"]["bleed-bbox"][0], b=dimension["rectangles"]["bleed-bbox"][1], @@ -192,7 +188,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["rectangles"]["bleed-bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + trim_bbox = BoundingBox( l=dimension["rectangles"]["trim-bbox"][0], b=dimension["rectangles"]["trim-bbox"][1], @@ -200,7 +196,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["rectangles"]["trim-bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + crop_bbox = BoundingBox( l=dimension["rectangles"]["crop-bbox"][0], b=dimension["rectangles"]["crop-bbox"][1], @@ -208,7 +204,7 @@ def _to_dimension(self, dimension: dict) -> PageDimension: t=dimension["rectangles"]["crop-bbox"][3], coord_origin=CoordOrigin.BOTTOMLEFT, ) - + return PageDimension( angle=dimension["angle"], page_boundary=dimension["page_boundary"], @@ -225,10 +221,10 @@ def _to_cells(self, cells: dict) -> List[PageCell]: assert "data" in cells, '"data" in cells' assert "header" in cells, '"header" in cells' - + data = cells["data"] header = cells["header"] - + result: List[PageCell] = [] for ind, row in enumerate(data): rect = BoundingRectangle( @@ -252,18 +248,17 @@ def _to_cells(self, cells: dict) -> List[PageCell]: rendering_mode="", ) result.append(cell) - - return result + return result def _to_images(self, images: dict) -> List[PageImage]: assert "data" in images, '"data" in images' assert "header" in images, '"header" in images' - + data = images["data"] header = images["header"] - + result: List[PageImage] = [] for ind, row in enumerate(data): rect = BoundingRectangle( @@ -278,9 +273,8 @@ def _to_images(self, images: dict) -> List[PageImage]: ) image = PageImage(rect=rect, uri=None) result.append(image) - - return result + return result def _to_lines(self, data: dict) -> List[PageLine]: @@ -291,15 +285,14 @@ def _to_lines(self, data: dict) -> List[PageLine]: i0: int = item["i"][l + 0] i1: int = item["i"][l + 1] - points: List[Tuple[float, float]] = [] + points: List[Tuple[float, float]] = [] for k in range(i0, i1): points.append((item["x"][k], item["y"][k])) - + line = PageLine(line_parent_id=l, points=points) result.append(line) - - return result + return result def _to_segmented_page(self, page: dict) -> SegmentedPage: @@ -310,7 +303,6 @@ def _to_segmented_page(self, page: dict) -> SegmentedPage: lines=self._to_lines(page["lines"]), ) - def _to_parsed_page(self, page: dict) -> ParsedPage: return ParsedPage( @@ -318,12 +310,13 @@ def _to_parsed_page(self, page: dict) -> ParsedPage: sanitized=self._to_segmented_page(page["sanitized"]), ) - def _to_parsed_paginated_document(self, doc_dict: dict,page_no: int = 1) -> ParsedPaginatedDocument: - + def _to_parsed_paginated_document( + self, doc_dict: dict, page_no: int = 1 + ) -> ParsedPaginatedDocument: + parsed_doc = ParsedPaginatedDocument() for pi, page in enumerate(doc_dict["pages"]): parsed_doc.pages[page_no + pi] = self._to_parsed_page(page) - + return parsed_doc - diff --git a/tests/test_parse.py b/tests/test_parse.py index e756512e..b9e73184 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -6,6 +6,7 @@ REGRESSION_FOLDER = "tests/data/regression/*.pdf" + def test_reference_documents_from_filenames_with_keys(): parser = pdf_parser(loglevel="fatal") @@ -18,15 +19,15 @@ def test_reference_documents_from_filenames_with_keys(): print(pdf_doc) doc_key = f"key={pdf_doc}" - - success = parser.load_document(key = doc_key, filename = pdf_doc) + + success = parser.load_document(key=doc_key, filename=pdf_doc) assert success - - doc = parser.parse(key = doc_key) + + doc = parser.parse(key=doc_key) for page_no, page in doc.pages.items(): - print( " -> ", page_no) + print(" -> ", page_no) res = page.original.render() res.show() diff --git a/tests/test_parse_v2.py b/tests/test_parse_v2.py index 8e5f1635..ff77c36a 100644 --- a/tests/test_parse_v2.py +++ b/tests/test_parse_v2.py @@ -13,10 +13,6 @@ import json import os -from docling_parse.document import ( - ParsedPaginatedDocument, - from_pdf_parser_v2_to_parsed_paginated_document, -) from docling_parse.pdf_parsers import pdf_parser_v2 # type: ignore[import] from docling_parse.utils import create_pil_image_of_page_v2 @@ -470,73 +466,3 @@ def test_sanitize_cells_in_bbox(): keys = parser.list_loaded_keys() assert len(keys) == 0, "len(keys)==0" - - -def test_paginated_document_conversion(): - - parser = pdf_parser_v2(level="fatal") - - pdf_docs = glob.glob(REGRESSION_FOLDER) - assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" - - for pdf_doc in pdf_docs: - - doc_key = f"key={pdf_doc}" - - success = parser.load_document(doc_key, pdf_doc) - - keys = parser.list_loaded_keys() - assert len(keys) == 1, "len(keys)==1" - - num_pages = parser.number_of_pages(doc_key) - - for page in range(0, min(MAX_PAGES, num_pages)): - - rname = os.path.basename(pdf_doc) - fname = os.path.join(GROUNDTRUTH_FOLDER, f"{rname}.v2.p={page}.json") - - doc: dict = parser.parse_pdf_from_key_on_page(key=doc_key, page=page) - - paginated_doc: PaginatedDocument = ( - from_pdf_parser_v2_to_parsed_paginated_document(doc_dict=doc) - ) - - parser.unload_document(doc_key) - - -def test_paginated_document_rendering(): - - parser = pdf_parser_v2(level="fatal") - - pdf_docs = glob.glob(REGRESSION_FOLDER) - assert len(pdf_docs) > 0, "len(pdf_docs)==0 -> nothing to test" - - for pdf_doc in pdf_docs: - - doc_key = f"key={pdf_doc}" - - success = parser.load_document(doc_key, pdf_doc) - - keys = parser.list_loaded_keys() - assert len(keys) == 1, "len(keys)==1" - - num_pages = parser.number_of_pages(doc_key) - - for page in range(0, min(MAX_PAGES, num_pages)): - - rname = os.path.basename(pdf_doc) - fname = os.path.join(GROUNDTRUTH_FOLDER, f"{rname}.v2.p={page}.json") - - doc: dict = parser.parse_pdf_from_key_on_page(key=doc_key, page=page) - - paginated_doc: ParsedPaginatedDocument = ( - from_pdf_parser_v2_to_parsed_paginated_document(doc_dict=doc) - ) - - img = paginated_doc.pages[1].original.render() - img.show() - - img = paginated_doc.pages[1].sanitized.render() - img.show() - - parser.unload_document(doc_key)