From e5a3c3198a92876f206ed0ac982d4ffa18f48518 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 2 Jan 2025 11:47:11 -0500 Subject: [PATCH 1/4] fix: get the font size correct (easier than it looked at first glance) --- playa/page.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/playa/page.py b/playa/page.py index f3d182e..6920a60 100644 --- a/playa/page.py +++ b/playa/page.py @@ -1623,12 +1623,11 @@ def render_char( x1, y1 = (adv, descent + rise + fontsize) (a, b, c, d, e, f) = matrix upright = a * d * scaling > 0 and b * c <= 0 - x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1)) - # NOTE: This is not right at all for rotated text, but we'll live with it if font.vertical: - size = x1 - x0 + size = abs(fontsize * a) else: - size = y1 - y0 + size = abs(fontsize * d) + x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1)) glyph_x, glyph_y = apply_matrix_norm(self.ctm, self.textstate.glyph_offset) item = LayoutDict( object_type="char", From 08314fecc36b97a1b7a676a1032aed03cd99f3d1 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 2 Jan 2025 12:30:44 -0500 Subject: [PATCH 2/4] fix: really fix and test get_transformed_bound and company --- TODO.md | 4 ++-- playa/page.py | 2 +- playa/utils.py | 8 ++++---- tests/test_lazy_api.py | 15 +++++++++++++++ 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/TODO.md b/TODO.md index 0874175..4986932 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,8 @@ ## PLAYA 0.2.x - [ ] Fix ToUnicode CMaps for CID fonts (file bug against pdfminer) -- [ ] Optimize text extraction +- [x] Optimize text extraction - [ ] Support slices and lists in `PageList.__getitem__` -- [ ] Remove remaining dangerous `cast` usage +- [x] Remove remaining dangerous `cast` usage ## PLAYA 0.3.x - [ ] remove `LayoutDict` diff --git a/playa/page.py b/playa/page.py index 6920a60..6a4a407 100644 --- a/playa/page.py +++ b/playa/page.py @@ -2079,7 +2079,7 @@ def _render_string(self, item: TextItem) -> Iterator[GlyphObject]: # Extract all the elements so we can translate efficiently a, b, c, d, e, f = mult_matrix(tstate.line_matrix, self.ctm) # Pre-determine if we need to recompute the bound for rotated glyphs - corners = a * b < 0 or d * c < 0 + corners = b * d < 0 or a * c < 0 # Apply horizontal scaling scaling = tstate.scaling * 0.01 charspace = tstate.charspace * scaling diff --git a/playa/utils.py b/playa/utils.py index bbaf9e7..895e7f6 100644 --- a/playa/utils.py +++ b/playa/utils.py @@ -260,20 +260,20 @@ def get_transformed_bound(matrix: Matrix, bbox: Rect) -> Rect: """Transform a bounding box and return the rectangle that covers the points of the resulting shape.""" x0, y0, x1, y1 = bbox - # No rotation involved, corners are still valid. FIXME: proof - if matrix[0] * matrix[1] >= 0 and matrix[2] * matrix[3] >= 0: + # Corners are still valid. + if matrix[0] * matrix[2] < 0 or matrix[1] * matrix[3] < 0: return get_bound( ( apply_matrix_pt(matrix, (x0, y0)), + apply_matrix_pt(matrix, (x0, y1)), apply_matrix_pt(matrix, (x1, y1)), + apply_matrix_pt(matrix, (x1, y0)), ) ) return get_bound( ( apply_matrix_pt(matrix, (x0, y0)), - apply_matrix_pt(matrix, (x0, y1)), apply_matrix_pt(matrix, (x1, y1)), - apply_matrix_pt(matrix, (x1, y0)), ) ) diff --git a/tests/test_lazy_api.py b/tests/test_lazy_api.py index 2615ad0..52f8e3f 100644 --- a/tests/test_lazy_api.py +++ b/tests/test_lazy_api.py @@ -2,6 +2,7 @@ Test the ContentObject API for pages. """ +import itertools from pathlib import Path import pytest @@ -9,6 +10,7 @@ import playa from playa.color import PREDEFINED_COLORSPACE, Color from playa.exceptions import PDFEncryptionError +from playa.utils import get_transformed_bound, get_bound, apply_matrix_pt from .data import TESTDIR, ALLPDFS, PASSWORDS, XFAILS, CONTRIB @@ -104,3 +106,16 @@ def test_rotated_glyphs() -> None: width = x1 - x0 assert width > 6 assert "".join(chars) == "R18,00" + + +def test_rotated_bboxes() -> None: + """Verify that rotated bboxes are correctly calculated.""" + points = ((0, 0), (0, 100), (100, 100), (100, 0)) + bbox = (0, 0, 100, 100) + # Test all possible sorts of CTM + vals = (-1, -0.5, 0, 0.5, 1) + for matrix in itertools.product(vals, repeat=4): + ctm = (*matrix, 0, 0) + gtb = get_transformed_bound(ctm, bbox) + bound = get_bound((apply_matrix_pt(ctm, p) for p in points)) + assert gtb == bound From 411fec065f5bf79082a32526a009ae7f5bd66c55 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 2 Jan 2025 12:35:26 -0500 Subject: [PATCH 3/4] chore: ruff --- playa/ccitt.py | 1 + playa/document.py | 1 + playa/font.py | 17 +++++++++++++---- playa/page.py | 12 ++++++++---- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/playa/ccitt.py b/playa/ccitt.py index 6e7b46f..add6f9b 100644 --- a/playa/ccitt.py +++ b/playa/ccitt.py @@ -568,6 +568,7 @@ def output_line(self, y: int, bits: Sequence[int]) -> None: def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes: from playa.pdftypes import int_value + K = params.get("K") if K == -1: cols = int_value(params.get("Columns")) diff --git a/playa/document.py b/playa/document.py index d10b3ab..a56bb6f 100644 --- a/playa/document.py +++ b/playa/document.py @@ -973,6 +973,7 @@ def layout(self) -> Iterator[LayoutDict]: DeprecationWarning, ) from typing import cast + for idx, page in enumerate(self.pages): for dic in page.layout: dic = cast(LayoutDict, dic) # ugh diff --git a/playa/font.py b/playa/font.py index 861c821..0227a29 100644 --- a/playa/font.py +++ b/playa/font.py @@ -95,7 +95,10 @@ def get_widths2(seq: Iterable[PDFObject]) -> Dict[int, Tuple[float, Point]]: if r: char1 = r[-1] for i, (w, vx, vy) in enumerate(choplist(3, v)): - widths[int(char1) + i] = (num_value(w), (int_value(vx), int_value(vy))) + widths[int(char1) + i] = ( + num_value(w), + (int_value(vx), int_value(vy)), + ) r = [] elif isinstance(v, (int, float)): # == utils.isnumber(v) r.append(v) @@ -685,7 +688,9 @@ def __init__(self, name: str, fp: BinaryIO) -> None: if format == b"\x00": # Format 0 n = self.nglyphs - 1 - for gid, sid in enumerate(struct.unpack(">" + "H" * n, self.fp.read(2 * n))): + for gid, sid in enumerate( + struct.unpack(">" + "H" * n, self.fp.read(2 * n)) + ): gid += 1 sidname = self.getstr(sid) self.name2gid[sidname] = gid @@ -724,7 +729,9 @@ def __init__(self, name: str, fp: BinaryIO) -> None: try: (ntables, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8)) for _ in range(ntables): - (name_bytes, tsum, offset, length) = struct.unpack(">4sLLL", fp.read(16)) + (name_bytes, tsum, offset, length) = struct.unpack( + ">4sLLL", fp.read(16) + ) self.tables[name_bytes] = (offset, length) except struct.error: # Do not fail if there are not enough bytes to read. Even for @@ -759,7 +766,9 @@ def create_unicode_map(self) -> FileUnicodeMap: nhdrs = max(subheaderkeys) // 8 + 1 hdrs: List[Tuple[int, int, int, int, int]] = [] for i in range(nhdrs): - (firstcode, entcount, delta, offset) = struct.unpack(">HHhH", fp.read(8)) + (firstcode, entcount, delta, offset) = struct.unpack( + ">HHhH", fp.read(8) + ) hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset)) for i, firstcode, entcount, delta, pos in hdrs: if not entcount: diff --git a/playa/page.py b/playa/page.py index 6a4a407..6107f4c 100644 --- a/playa/page.py +++ b/playa/page.py @@ -2041,10 +2041,14 @@ def bbox(self) -> Rect: x1, y1 = (self.adv, descent + tstate.rise + tstate.fontsize) if self.corners: - return get_bound((apply_matrix_pt(self.matrix, (x0, y0)), - apply_matrix_pt(self.matrix, (x0, y1)), - apply_matrix_pt(self.matrix, (x1, y1)), - apply_matrix_pt(self.matrix, (x1, y0)),)) + return get_bound( + ( + apply_matrix_pt(self.matrix, (x0, y0)), + apply_matrix_pt(self.matrix, (x0, y1)), + apply_matrix_pt(self.matrix, (x1, y1)), + apply_matrix_pt(self.matrix, (x1, y0)), + ) + ) else: x0, y0 = apply_matrix_pt(self.matrix, (x0, y0)) x1, y1 = apply_matrix_pt(self.matrix, (x1, y1)) From 637cba9cb7eefdb86f42aa579bd9ebf2f0771bdc Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 2 Jan 2025 12:37:22 -0500 Subject: [PATCH 4/4] chore: mypy --- tests/test_lazy_api.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_lazy_api.py b/tests/test_lazy_api.py index 52f8e3f..847c548 100644 --- a/tests/test_lazy_api.py +++ b/tests/test_lazy_api.py @@ -4,13 +4,14 @@ import itertools from pathlib import Path +from typing import cast import pytest import playa from playa.color import PREDEFINED_COLORSPACE, Color from playa.exceptions import PDFEncryptionError -from playa.utils import get_transformed_bound, get_bound, apply_matrix_pt +from playa.utils import get_transformed_bound, get_bound, apply_matrix_pt, Matrix from .data import TESTDIR, ALLPDFS, PASSWORDS, XFAILS, CONTRIB @@ -115,7 +116,7 @@ def test_rotated_bboxes() -> None: # Test all possible sorts of CTM vals = (-1, -0.5, 0, 0.5, 1) for matrix in itertools.product(vals, repeat=4): - ctm = (*matrix, 0, 0) + ctm = cast(Matrix, (*matrix, 0, 0)) gtb = get_transformed_bound(ctm, bbox) bound = get_bound((apply_matrix_pt(ctm, p) for p in points)) assert gtb == bound