From 9e9abb0a347ce1c56550be6f38b223ca55118b3d Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 16 Jan 2025 05:50:57 +0100 Subject: [PATCH] fixed the tests and added PdfColoredElement Signed-off-by: Peter Staar --- docling_parse/document.py | 28 +++++++++++++++++----------- docling_parse/pdf_parser.py | 4 ++-- tests/test_parse.py | 2 +- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/docling_parse/document.py b/docling_parse/document.py index 9b7455b2..7775d093 100644 --- a/docling_parse/document.py +++ b/docling_parse/document.py @@ -142,15 +142,17 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle": class PdfBaseElement(BaseModel): ordering: int +class PdfColoredElement(PdfBaseElement): + rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255) + -class PdfCell(PdfBaseElement): +class PdfCell(PdfColoredElement): rect: BoundingRectangle text: str orig: str - ordering: int rendering_mode: str font_key: str @@ -158,26 +160,30 @@ class PdfCell(PdfBaseElement): widget: bool - rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255) + def to_bottom_left_origin(self, page_height: float): + self.rect = self.rect.to_bottom_left_origin(page_height=page_height) + def to_top_left_origin(self, page_height: float): + self.rect = self.rect.to_top_left_origin(page_height=page_height) class PdfBitmapResource(PdfBaseElement): rect: BoundingRectangle uri: Optional[AnyUrl] + def to_bottom_left_origin(self, page_height: float): + self.rect = self.rect.to_bottom_left_origin(page_height=page_height) -class PdfLine(PdfBaseElement): + def to_top_left_origin(self, page_height: float): + self.rect = self.rect.to_top_left_origin(page_height=page_height) + +class PdfLine(PdfColoredElement): - # line_parent_id: int + parent_id: int points: List[Tuple[float, float]] - # line_parent_id: int - - coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT - - # FIXME: could use something more sofisticated? - rgba: Tuple[int, int, int, int] = (0, 0, 0, 255) width: float = 1.0 + + coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT def __len__(self) -> int: return len(self.points) diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 5d801274..413e72ef 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -237,8 +237,8 @@ def _to_lines(self, data: dict) -> List[PdfLine]: line = PdfLine( ordering=ind, - points=points, - # line_parent_id=l, + parent_id=l, + points=points, ) result.append(line) diff --git a/tests/test_parse.py b/tests/test_parse.py index 4b8730f0..64b6eb38 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -23,7 +23,7 @@ def test_reference_documents_from_filenames(): print(pdf_doc_path) pdf_doc: PdfDocument = parser.load( - pagth_or_stream=pdf_doc_path, + path_or_stream=pdf_doc_path, boundary_type=PageBoundaryType.CROP_BOX, # default: CROP_BOX lazy=False, ) # default: True