Skip to content

Commit

Permalink
fix: extract text from XObjects too! (oops!)
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Feb 20, 2025
1 parent bff14d4 commit 958a09d
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions playa/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

import playa
from playa import Document, Page
from playa.page import MarkedContent, TextObject
from playa.page import MarkedContent, TextObject, XObjectObject
from playa.pdftypes import ContentStream, ObjRef, resolve1
from playa.structure import Element, ContentObject as StructContentObject, ContentItem
from playa.utils import decode_text
Expand Down Expand Up @@ -329,13 +329,21 @@ def get_text_from_obj(obj: TextObject, vertical: bool) -> Tuple[str, float]:
return "".join(chars), prev_end


def get_all_texts(page: Union[Page, XObjectObject]) -> Iterator[TextObject]:
    """Iterate over every text object in a page, descending into XObjects.

    Objects are visited in iteration order.  Whenever an XObject is
    encountered, its own contents are walked recursively (depth-first)
    before moving on to the next sibling.  Anything that is neither an
    XObject nor a text object is ignored.

    Args:
        page: The page, or an XObject, whose contents should be searched.

    Yields:
        Each text object found, including those nested inside XObjects.
    """
    for item in page:
        if isinstance(item, XObjectObject):
            # Walk the XObject's own contents so its text is not missed.
            yield from get_all_texts(item)
            continue
        if isinstance(item, TextObject):
            yield item


def get_text_untagged(page: Page) -> str:
"""Get text from a page of an untagged PDF."""
prev_line_matrix = None
prev_end = 0.0
lines = []
strings = []
for text in page.texts:
for text in get_all_texts(page):
line_matrix = text.textstate.line_matrix
vertical = (
False if text.textstate.font is None else text.textstate.font.vertical
Expand Down Expand Up @@ -365,7 +373,7 @@ def get_text_tagged(page: Page) -> str:
strings: List[str] = []
at_mcs: Union[MarkedContent, None] = None
prev_mcid: Union[int, None] = None
for text in page.texts:
for text in get_all_texts(page):
in_artifact = same_actual_text = reversed_chars = False
actual_text = None
for mcs in reversed(text.mcstack):
Expand Down

0 comments on commit 958a09d

Please sign in to comment.