Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix font size and rotated/skewed bounding boxes #39

Merged
merged 4 commits into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
## PLAYA 0.2.x
- [ ] Fix ToUnicode CMaps for CID fonts (file bug against pdfminer)
- [ ] Optimize text extraction
- [x] Optimize text extraction
- [ ] Support slices and lists in `PageList.__getitem__`
- [ ] Remove remaining dangerous `cast` usage
- [x] Remove remaining dangerous `cast` usage

## PLAYA 0.3.x
- [ ] remove `LayoutDict`
Expand Down
1 change: 1 addition & 0 deletions playa/ccitt.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,6 +568,7 @@ def output_line(self, y: int, bits: Sequence[int]) -> None:

def ccittfaxdecode(data: bytes, params: Dict[str, object]) -> bytes:
from playa.pdftypes import int_value

K = params.get("K")
if K == -1:
cols = int_value(params.get("Columns"))
Expand Down
1 change: 1 addition & 0 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,6 +973,7 @@ def layout(self) -> Iterator[LayoutDict]:
DeprecationWarning,
)
from typing import cast

for idx, page in enumerate(self.pages):
for dic in page.layout:
dic = cast(LayoutDict, dic) # ugh
Expand Down
17 changes: 13 additions & 4 deletions playa/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,10 @@ def get_widths2(seq: Iterable[PDFObject]) -> Dict[int, Tuple[float, Point]]:
if r:
char1 = r[-1]
for i, (w, vx, vy) in enumerate(choplist(3, v)):
widths[int(char1) + i] = (num_value(w), (int_value(vx), int_value(vy)))
widths[int(char1) + i] = (
num_value(w),
(int_value(vx), int_value(vy)),
)
r = []
elif isinstance(v, (int, float)): # == utils.isnumber(v)
r.append(v)
Expand Down Expand Up @@ -685,7 +688,9 @@ def __init__(self, name: str, fp: BinaryIO) -> None:
if format == b"\x00":
# Format 0
n = self.nglyphs - 1
for gid, sid in enumerate(struct.unpack(">" + "H" * n, self.fp.read(2 * n))):
for gid, sid in enumerate(
struct.unpack(">" + "H" * n, self.fp.read(2 * n))
):
gid += 1
sidname = self.getstr(sid)
self.name2gid[sidname] = gid
Expand Down Expand Up @@ -724,7 +729,9 @@ def __init__(self, name: str, fp: BinaryIO) -> None:
try:
(ntables, _1, _2, _3) = struct.unpack(">HHHH", fp.read(8))
for _ in range(ntables):
(name_bytes, tsum, offset, length) = struct.unpack(">4sLLL", fp.read(16))
(name_bytes, tsum, offset, length) = struct.unpack(
">4sLLL", fp.read(16)
)
self.tables[name_bytes] = (offset, length)
except struct.error:
# Do not fail if there are not enough bytes to read. Even for
Expand Down Expand Up @@ -759,7 +766,9 @@ def create_unicode_map(self) -> FileUnicodeMap:
nhdrs = max(subheaderkeys) // 8 + 1
hdrs: List[Tuple[int, int, int, int, int]] = []
for i in range(nhdrs):
(firstcode, entcount, delta, offset) = struct.unpack(">HHhH", fp.read(8))
(firstcode, entcount, delta, offset) = struct.unpack(
">HHhH", fp.read(8)
)
hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
for i, firstcode, entcount, delta, pos in hdrs:
if not entcount:
Expand Down
21 changes: 12 additions & 9 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,12 +1623,11 @@ def render_char(
x1, y1 = (adv, descent + rise + fontsize)
(a, b, c, d, e, f) = matrix
upright = a * d * scaling > 0 and b * c <= 0
x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1))
# NOTE: This is not right at all for rotated text, but we'll live with it
if font.vertical:
size = x1 - x0
size = abs(fontsize * a)
else:
size = y1 - y0
size = abs(fontsize * d)
x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1))
glyph_x, glyph_y = apply_matrix_norm(self.ctm, self.textstate.glyph_offset)
item = LayoutDict(
object_type="char",
Expand Down Expand Up @@ -2042,10 +2041,14 @@ def bbox(self) -> Rect:
x1, y1 = (self.adv, descent + tstate.rise + tstate.fontsize)

if self.corners:
return get_bound((apply_matrix_pt(self.matrix, (x0, y0)),
apply_matrix_pt(self.matrix, (x0, y1)),
apply_matrix_pt(self.matrix, (x1, y1)),
apply_matrix_pt(self.matrix, (x1, y0)),))
return get_bound(
(
apply_matrix_pt(self.matrix, (x0, y0)),
apply_matrix_pt(self.matrix, (x0, y1)),
apply_matrix_pt(self.matrix, (x1, y1)),
apply_matrix_pt(self.matrix, (x1, y0)),
)
)
else:
x0, y0 = apply_matrix_pt(self.matrix, (x0, y0))
x1, y1 = apply_matrix_pt(self.matrix, (x1, y1))
Expand Down Expand Up @@ -2080,7 +2083,7 @@ def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
# Extract all the elements so we can translate efficiently
a, b, c, d, e, f = mult_matrix(tstate.line_matrix, self.ctm)
# Pre-determine if we need to recompute the bound for rotated glyphs
corners = a * b < 0 or d * c < 0
corners = b * d < 0 or a * c < 0
# Apply horizontal scaling
scaling = tstate.scaling * 0.01
charspace = tstate.charspace * scaling
Expand Down
8 changes: 4 additions & 4 deletions playa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,20 +260,20 @@ def get_transformed_bound(matrix: Matrix, bbox: Rect) -> Rect:
"""Transform a bounding box and return the rectangle that covers
the points of the resulting shape."""
x0, y0, x1, y1 = bbox
# No rotation involved, corners are still valid. FIXME: proof
if matrix[0] * matrix[1] >= 0 and matrix[2] * matrix[3] >= 0:
# Corners are still valid.
if matrix[0] * matrix[2] < 0 or matrix[1] * matrix[3] < 0:
return get_bound(
(
apply_matrix_pt(matrix, (x0, y0)),
apply_matrix_pt(matrix, (x0, y1)),
apply_matrix_pt(matrix, (x1, y1)),
apply_matrix_pt(matrix, (x1, y0)),
)
)
return get_bound(
(
apply_matrix_pt(matrix, (x0, y0)),
apply_matrix_pt(matrix, (x0, y1)),
apply_matrix_pt(matrix, (x1, y1)),
apply_matrix_pt(matrix, (x1, y0)),
)
)

Expand Down
16 changes: 16 additions & 0 deletions tests/test_lazy_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@
Test the ContentObject API for pages.
"""

import itertools
from pathlib import Path
from typing import cast

import pytest

import playa
from playa.color import PREDEFINED_COLORSPACE, Color
from playa.exceptions import PDFEncryptionError
from playa.utils import get_transformed_bound, get_bound, apply_matrix_pt, Matrix

from .data import TESTDIR, ALLPDFS, PASSWORDS, XFAILS, CONTRIB

Expand Down Expand Up @@ -104,3 +107,16 @@ def test_rotated_glyphs() -> None:
width = x1 - x0
assert width > 6
assert "".join(chars) == "R18,00"


def test_rotated_bboxes() -> None:
    """Check get_transformed_bound against a brute-force corner transform.

    Every combination of four coefficients drawn from a small grid is used
    as the first four matrix entries (the last two are zero), and the
    resulting matrix is applied to a 100x100 box.  The bound returned by
    get_transformed_bound must equal the bound of the four corners
    transformed one by one.
    """
    side = 100
    rect = (0, 0, side, side)
    corners = ((0, 0), (0, side), (side, side), (side, 0))
    # Negative, zero and positive entries, so every sign combination
    # of the four coefficients is exercised.
    coefficients = (-1, -0.5, 0, 0.5, 1)
    for a, b, c, d in itertools.product(coefficients, repeat=4):
        ctm = cast(Matrix, (a, b, c, d, 0, 0))
        actual = get_transformed_bound(ctm, rect)
        expected = get_bound(apply_matrix_pt(ctm, pt) for pt in corners)
        assert actual == expected
Loading