Skip to content

Commit

Permalink
fixed the tests and added PdfColoredElement
Browse files Browse the repository at this point in the history
Signed-off-by: Peter Staar <[email protected]>
  • Loading branch information
PeterStaar-IBM committed Jan 16, 2025
1 parent 2950c3f commit 9e9abb0
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 14 deletions.
28 changes: 17 additions & 11 deletions docling_parse/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,42 +142,48 @@ def to_top_left_origin(self, page_height: float) -> "BoundingRectangle":
class PdfBaseElement(BaseModel):
ordering: int

class PdfColoredElement(PdfBaseElement):
rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)


class PdfCell(PdfBaseElement):
class PdfCell(PdfColoredElement):

rect: BoundingRectangle

text: str
orig: str

ordering: int
rendering_mode: str

font_key: str
font_name: str

widget: bool

rgba: ColorRGBA = ColorRGBA(r=0, g=0, b=0, a=255)
def to_bottom_left_origin(self, page_height: float):
self.rect = self.rect.to_bottom_left_origin(page_height=page_height)

def to_top_left_origin(self, page_height: float):
self.rect = self.rect.to_top_left_origin(page_height=page_height)

class PdfBitmapResource(PdfBaseElement):

rect: BoundingRectangle
uri: Optional[AnyUrl]

def to_bottom_left_origin(self, page_height: float):
self.rect = self.rect.to_bottom_left_origin(page_height=page_height)

class PdfLine(PdfBaseElement):
def to_top_left_origin(self, page_height: float):
self.rect = self.rect.to_top_left_origin(page_height=page_height)

class PdfLine(PdfColoredElement):

# line_parent_id: int
parent_id: int
points: List[Tuple[float, float]]
# line_parent_id: int

coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

# FIXME: could use something more sophisticated?
rgba: Tuple[int, int, int, int] = (0, 0, 0, 255)
width: float = 1.0

coord_origin: CoordOrigin = CoordOrigin.BOTTOMLEFT

def __len__(self) -> int:
return len(self.points)
Expand Down
4 changes: 2 additions & 2 deletions docling_parse/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ def _to_lines(self, data: dict) -> List[PdfLine]:

line = PdfLine(
ordering=ind,
points=points,
# line_parent_id=l,
parent_id=l,
points=points,
)
result.append(line)

Expand Down
2 changes: 1 addition & 1 deletion tests/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_reference_documents_from_filenames():
print(pdf_doc_path)

pdf_doc: PdfDocument = parser.load(
pagth_or_stream=pdf_doc_path,
path_or_stream=pdf_doc_path,
boundary_type=PageBoundaryType.CROP_BOX, # default: CROP_BOX
lazy=False,
) # default: True
Expand Down

0 comments on commit 9e9abb0

Please sign in to comment.