Skip to content

Commit 493785c

Browse files
ROB: Handle zero height fonts when extracting text (#3075)
Closes #3074.
1 parent: afd7004 · commit: 493785c

File tree

2 files changed

+14 additions, -2 deletions (lines changed)

2 files changed

+14 additions, -2 deletions (lines changed)

pypdf/_text_extraction/_layout_mode/_fixed_width_page.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -365,8 +365,9 @@ def fixed_width_page(
365365
last_y_coord = 0
366366
for y_coord, line_data in ty_groups.items():
367367
if space_vertically and lines:
368-
blank_lines = (
369-
int(abs(y_coord - last_y_coord) / (line_data[0]["font_height"] * font_height_weight)) - 1
368+
fh = line_data[0]["font_height"]
369+
blank_lines = 0 if fh == 0 else (
370+
int(abs(y_coord - last_y_coord) / (fh * font_height_weight)) - 1
370371
)
371372
lines.extend([""] * blank_lines)
372373
line = ""

tests/test_text_extraction.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -320,3 +320,14 @@ def test_iss3060():
320320
# pypdf.errors.PdfReadError: font not set: is PDF missing a Tf operator?
321321
txt = reader.pages[0].extract_text(extraction_mode="layout")
322322
assert txt.startswith(" *******")
323+
324+
325+
@pytest.mark.enable_socket
326+
def test_iss3074():
327+
"""Test for not throwing 'ZeroDivisionError: float division by zero'"""
328+
url = "https://github.com/user-attachments/files/18533211/test-anon.pdf"
329+
name = "iss3074.pdf"
330+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
331+
# pypdf.errors.PdfReadError: ZeroDivisionError: float division by zero
332+
txt = reader.pages[0].extract_text(extraction_mode="layout")
333+
assert txt.strip().startswith("AAAAAA")

0 commit comments

Comments
 (0)