From 4a84b70db93afc2e4a5941ab88ce731a28776688 Mon Sep 17 00:00:00 2001 From: David Huggins-Daines Date: Thu, 20 Feb 2025 09:07:54 -0500 Subject: [PATCH] feat: new APIs, flatten, extract_text, is_tagged (#62) * feat: new APIs, flatten, extract_text, is_tagged * fix: be robust when getting MarkInfo * feat: add extract_text benchmark --- CHANGELOG.md | 6 +- README.md | 13 +++- benchmarks/text.py | 25 +++++++- playa/cli.py | 106 +------------------------------- playa/document.py | 4 ++ playa/page.py | 134 +++++++++++++++++++++++++++++++++++++++++ tests/test_document.py | 14 +++++ 7 files changed, 191 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dae04d8..6b00f5d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,7 @@ ## PLAYA 0.3.0: unreleased +- API for text extraction +- Extract text from XObjects with `playa --text` - Remove deprecated `LayoutDict` API and simplify code - Deprecate `annots` API and add friendly `annotations` - Elevate `resolve1` and `resolve_all` to top-level exports @@ -10,10 +12,12 @@ ### TODO - Deprecate `outlines` API and add tree-structured `outline` + - Link structured elements in outline to structure tree (lazily) - Deprecate `dests` API and add friendly `destinations` - Create friendly API for actions - Expose `XRef` API for users to look around in -- Format all `ContentObject` as JSON +- Methods for all `ContentObject` to format as JSON/dict + - Do not want to use Pydantic but be Pydantic-like ## PLAYA 0.2.10: 2025-02-18 - Fix serious bug in rare ' and " text operators diff --git a/README.md b/README.md index 1cd05eb..95e8150 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The purpose of PLAYA is to provide an efficent, parallel and parallelizable, pure-Python and Pythonic (for its author's definition of the term), lazy interface to the internals of PDF files. -If you just want to extract text from a PDF, there are a better and/or +If you just want to extract text from a PDF, there are better and/or faster tools and libraries out there, notably [pypdfium2](https://pypi.org/project/pypdfium2/) and [pypdf](https://pypi.org/project/pypdf/), among others. See [these @@ -149,7 +149,7 @@ page = pdf.pages["42"] # or "logical" page number (also a string) print(f"Page {page.label} is {page.width} x {page.height}") ``` -Since PDF is at heard a page-oriented, presentation format, many types +Since PDF is at heart a page-oriented, presentation format, many types of metadata are mostly accessible via the page objects. For example, annotations (internal or external links) are only @@ -296,9 +296,16 @@ in the aforementioned interpretation of "device space"): ```python for obj in page: print(f"{obj.object_type} at {obj.bbox}") + + # With space="screen" (the default) left, top, right, bottom = obj.bbox print(f" top left is {left, top}") - print(f" bottom right is {right, botom}") + print(f" bottom right is {right, bottom}") + + # With space="page" or space="default" + left, bottom, right, top = obj.bbox + print(f" bottom left is {left, bottom}") + print(f" top right is {right, top}") ``` Another important piece of information (which `pdfminer.six` does not diff --git a/benchmarks/text.py b/benchmarks/text.py index 0bfba13..5f846c3 100644 --- a/benchmarks/text.py +++ b/benchmarks/text.py @@ -26,15 +26,34 @@ def benchmark_chars(path: Path): _ = obj.chars +def benchmark_text(path: Path): + """Extract text, sort of.""" + import playa + + if path.name in PDFMINER_BUGS or path.name in XFAILS: + return + passwords = PASSWORDS.get(path.name, [""]) + for password in passwords: + LOG.info("Reading %s", path) + with playa.open(path, password=password) as pdf: + for page in pdf.pages: + page.extract_text() + + if __name__ == "__main__": # Silence warnings about broken PDFs logging.basicConfig(level=logging.ERROR) niter = 5 - miner_time = beach_time = lazy_time = 0.0 + chars_time = text_time = 0.0 for iter in range(niter + 1): for path in BASEPDFS: start = time.time() benchmark_chars(path) if iter != 0: - lazy_time += time.time() - start - print("chars took %.2fs / iter" % (lazy_time / niter,)) + chars_time += time.time() - start + start = time.time() + benchmark_text(path) + if iter != 0: + text_time += time.time() - start + print("chars took %.2fs / iter" % (chars_time / niter,)) + print("extract_text took %.2fs / iter" % (text_time / niter,)) diff --git a/playa/cli.py b/playa/cli.py index 2d901f0..b8acb97 100644 --- a/playa/cli.py +++ b/playa/cli.py @@ -72,14 +72,12 @@ import itertools import json import logging -import textwrap from collections import deque from pathlib import Path from typing import Any, Deque, Dict, Iterable, Iterator, List, TextIO, Tuple, Union import playa from playa import Document, Page -from playa.page import MarkedContent, TextObject, XObjectObject from playa.pdftypes import ContentStream, ObjRef, resolve1 from playa.structure import Element, ContentObject as StructContentObject, ContentItem from playa.utils import decode_text @@ -313,112 +311,12 @@ def extract_page_contents(doc: Document, args: argparse.Namespace) -> None: args.outfile.buffer.write(data) -def get_text_from_obj(obj: TextObject, vertical: bool) -> Tuple[str, float]: - """Try to get text from a text object.""" - chars = [] - prev_end = 0.0 - for glyph in obj: - x, y = glyph.textstate.glyph_offset - off = y if vertical else x - # FIXME: This is a heuristic!!! - if prev_end and off - prev_end > 0.5: - chars.append(" ") - if glyph.text is not None: - chars.append(glyph.text) - prev_end = off + glyph.adv - return "".join(chars), prev_end - - -def get_all_texts(page: Union[Page, XObjectObject]) -> Iterator[TextObject]: - for obj in page: - if isinstance(obj, XObjectObject): - yield from get_all_texts(obj) - elif isinstance(obj, TextObject): - yield obj - - -def get_text_untagged(page: Page) -> str: - """Get text from a page of an untagged PDF.""" - prev_line_matrix = None - prev_end = 0.0 - lines = [] - strings = [] - for text in get_all_texts(page): - line_matrix = text.textstate.line_matrix - vertical = ( - False if text.textstate.font is None else text.textstate.font.vertical - ) - lpos = -2 if vertical else -1 - if prev_line_matrix is not None and line_matrix[lpos] < prev_line_matrix[lpos]: - lines.append("".join(strings)) - strings.clear() - wpos = -1 if vertical else -2 - if ( - prev_line_matrix is not None - and prev_end + prev_line_matrix[wpos] < line_matrix[wpos] - ): - strings.append(" ") - textstr, end = get_text_from_obj(text, vertical) - strings.append(textstr) - prev_line_matrix = line_matrix - prev_end = end - if strings: - lines.append("".join(strings)) - return "\n".join(lines) - - -def get_text_tagged(page: Page) -> str: - """Get text from a page of a tagged PDF.""" - lines: List[str] = [] - strings: List[str] = [] - at_mcs: Union[MarkedContent, None] = None - prev_mcid: Union[int, None] = None - for text in get_all_texts(page): - in_artifact = same_actual_text = reversed_chars = False - actual_text = None - for mcs in reversed(text.mcstack): - if mcs.tag == "Artifact": - in_artifact = True - break - actual_text = mcs.props.get("ActualText") - if actual_text is not None: - if mcs is at_mcs: - same_actual_text = True - at_mcs = mcs - break - if mcs.tag == "ReversedChars": - reversed_chars = True - break - if in_artifact or same_actual_text: - continue - if actual_text is None: - chars = text.chars - if reversed_chars: - chars = chars[::-1] - else: - assert isinstance(actual_text, bytes) - chars = actual_text.decode("UTF-16") - # Remove soft hyphens - chars = chars.replace("\xad", "") - # Insert a line break (FIXME: not really correct) - if text.mcid != prev_mcid: - lines.extend(textwrap.wrap("".join(strings))) - strings.clear() - prev_mcid = text.mcid - strings.append(chars) - if strings: - lines.extend(textwrap.wrap("".join(strings))) - return "\n".join(lines) - - def extract_text(doc: Document, args: argparse.Namespace) -> None: """Extract text, but not in any kind of fancy way.""" pages = decode_page_spec(doc, args.pages) - if "MarkInfo" not in doc.catalog or not doc.catalog["MarkInfo"].get("Marked"): + if not doc.is_tagged: LOG.warning("Document is not a tagged PDF, text may not be readable") - textor = doc.pages[pages].map(get_text_untagged) - else: - textor = doc.pages[pages].map(get_text_tagged) + textor = doc.pages[pages].map(Page.extract_text) for text in textor: print(text, file=args.outfile) diff --git a/playa/document.py b/playa/document.py index 0cf60c9..09f4e41 100644 --- a/playa/document.py +++ b/playa/document.py @@ -902,6 +902,10 @@ def __init__( self.pdf_version, ) self.pdf_version = self.catalog["Version"] + self.is_tagged = False + markinfo = resolve1(self.catalog.get("MarkInfo")) + if isinstance(markinfo, dict): + self.is_tagged = not not markinfo.get("Marked") def _initialize_password(self, password: str = "") -> None: """Initialize the decryption handler with a given password, if any. diff --git a/playa/page.py b/playa/page.py index 6f38bdf..edeec5e 100644 --- a/playa/page.py +++ b/playa/page.py @@ -5,6 +5,7 @@ import itertools import logging import re +import textwrap import warnings from copy import copy from dataclasses import dataclass @@ -20,8 +21,10 @@ Optional, Tuple, Type, + TypeVar, Union, cast, + overload, ) from playa.color import ( @@ -80,6 +83,7 @@ LITERAL_IMAGE = LIT("Image") TextSeq = Iterable[Union[int, float, bytes]] DeviceSpace = Literal["page", "screen", "default", "user"] +CO = TypeVar("CO") # FIXME: This should go in utils/pdftypes but there are circular imports @@ -93,6 +97,23 @@ def parse_rect(o: PDFObject) -> Rect: raise PDFSyntaxError("Rectangle contains non-numeric values") +# FIXME: This should be a method of TextObject (soon) +def _extract_text_from_obj(obj: "TextObject", vertical: bool) -> Tuple[str, float]: + """Try to get text from a text object.""" + chars = [] + prev_end = 0.0 + for glyph in obj: + x, y = glyph.textstate.glyph_offset + off = y if vertical else x + # FIXME: This is a heuristic!!! + if prev_end and off - prev_end > 0.5: + chars.append(" ") + if glyph.text is not None: + chars.append(glyph.text) + prev_end = off + glyph.adv + return "".join(chars), prev_end + + class Page: """An object that holds the information about a page. @@ -367,6 +388,119 @@ def structtree(self) -> StructTree: def __repr__(self) -> str: return f"" + @overload + def flatten(self) -> Iterator["ContentObject"]: ... + + @overload + def flatten(self, filter_class: Type[CO]) -> Iterator[CO]: ... + + def flatten( + self, filter_class: Union[None, Type[CO]] = None + ) -> Iterator[Union[CO, "ContentObject"]]: + """Iterate over content objects, recursing into form XObjects.""" + + def flatten_one(itor: Iterable["ContentObject"]) -> Iterator["ContentObject"]: + for obj in itor: + if isinstance(obj, XObjectObject): + yield from flatten_one(obj) + else: + yield obj + + if filter_class is None: + yield from flatten_one(self) + else: + for obj in flatten_one(self): + if isinstance(obj, filter_class): + yield obj + + def extract_text(self) -> str: + """Do some best-effort text extraction. + + This necessarily involves a few heuristics, so don't get your + hopes up. It will attempt to use marked content information + for a tagged PDF, otherwise it will fall back on the character + displacement and line matrix to determine word and line breaks. + """ + if self.doc.is_tagged: + return self.extract_text_tagged() + else: + return self.extract_text_untagged() + + def extract_text_untagged(self) -> str: + """Get text from a page of an untagged PDF.""" + prev_line_matrix = None + prev_end = 0.0 + lines = [] + strings = [] + for text in self.flatten(TextObject): + line_matrix = text.textstate.line_matrix + vertical = ( + False if text.textstate.font is None else text.textstate.font.vertical + ) + lpos = -2 if vertical else -1 + if ( + prev_line_matrix is not None + and line_matrix[lpos] < prev_line_matrix[lpos] + ): + lines.append("".join(strings)) + strings.clear() + wpos = -1 if vertical else -2 + if ( + prev_line_matrix is not None + and prev_end + prev_line_matrix[wpos] < line_matrix[wpos] + ): + strings.append(" ") + textstr, end = _extract_text_from_obj(text, vertical) + strings.append(textstr) + prev_line_matrix = line_matrix + prev_end = end + if strings: + lines.append("".join(strings)) + return "\n".join(lines) + + def extract_text_tagged(self) -> str: + """Get text from a page of a tagged PDF.""" + lines: List[str] = [] + strings: List[str] = [] + at_mcs: Union[MarkedContent, None] = None + prev_mcid: Union[int, None] = None + for text in self.flatten(TextObject): + in_artifact = same_actual_text = reversed_chars = False + actual_text = None + for mcs in reversed(text.mcstack): + if mcs.tag == "Artifact": + in_artifact = True + break + actual_text = mcs.props.get("ActualText") + if actual_text is not None: + if mcs is at_mcs: + same_actual_text = True + at_mcs = mcs + break + if mcs.tag == "ReversedChars": + reversed_chars = True + break + if in_artifact or same_actual_text: + continue + if actual_text is None: + chars = text.chars + if reversed_chars: + chars = chars[::-1] + else: + assert isinstance(actual_text, bytes) + chars = actual_text.decode("UTF-16") + # Remove soft hyphens + chars = chars.replace("\xad", "") + # Insert a line break (FIXME: not really correct) + if text.mcid != prev_mcid: + lines.extend(textwrap.wrap("".join(strings))) + strings.clear() + prev_mcid = text.mcid + strings.append(chars) + if strings: + lines.extend(textwrap.wrap("".join(strings))) + return "\n".join(lines) + @dataclass class Annotation: diff --git a/tests/test_document.py b/tests/test_document.py index ba0b1fe..0311ebd 100644 --- a/tests/test_document.py +++ b/tests/test_document.py @@ -10,6 +10,7 @@ from playa.data_structures import NameTree from playa.document import read_header, XRefTable from playa.exceptions import PDFSyntaxError +from playa.page import TextObject from playa.parser import LIT from playa.utils import decode_text from .data import CONTRIB, TESTDIR @@ -161,9 +162,22 @@ def test_xobjects() -> None: assert xobj.object_type == "xobject" assert len(list(xobj)) == 2 + for obj in page.flatten(): + assert obj.object_type != "xobject" + + for obj in page.flatten(TextObject): + assert isinstance(obj, TextObject) + def test_annotations() -> None: with playa.open(TESTDIR / "simple5.pdf") as doc: page = doc.pages[0] for annot in page.annotations: assert annot.page is page + + +def test_is_tagged() -> None: + with playa.open(TESTDIR / "simple1.pdf") as doc: + assert not doc.is_tagged + with playa.open(TESTDIR / "pdf_structure.pdf") as doc: + assert doc.is_tagged