From 4a84b70db93afc2e4a5941ab88ce731a28776688 Mon Sep 17 00:00:00 2001
From: David Huggins-Daines <dhd@ecolingui.ca>
Date: Thu, 20 Feb 2025 09:07:54 -0500
Subject: [PATCH] feat: new APIs, flatten, extract_text, is_tagged (#62)

* feat: new APIs, flatten, extract_text, is_tagged

* fix: be robust when getting MarkInfo

* feat: add extract_text benchmark
---
 CHANGELOG.md           |   6 +-
 README.md              |  13 +++-
 benchmarks/text.py     |  25 +++++++-
 playa/cli.py           | 106 +-------------------------------
 playa/document.py      |   4 ++
 playa/page.py          | 134 +++++++++++++++++++++++++++++++++++++++++
 tests/test_document.py |  14 +++++
 7 files changed, 191 insertions(+), 111 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index dae04d8..6b00f5d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## PLAYA 0.3.0: unreleased
 
+- API for text extraction
+- Extract text from XObjects with `playa --text`
 - Remove deprecated `LayoutDict` API and simplify code
 - Deprecate `annots` API and add friendly `annotations`
 - Elevate `resolve1` and `resolve_all` to top-level exports
@@ -10,10 +12,12 @@
 ### TODO
 
 - Deprecate `outlines` API and add tree-structured `outline`
+  - Link structured elements in outline to structure tree (lazily)
 - Deprecate `dests` API and add friendly `destinations`
 - Create friendly API for actions
 - Expose `XRef` API for users to look around in
-- Format all `ContentObject` as JSON
+- Add methods to every `ContentObject` for formatting as JSON/dict
+  - Avoid depending on Pydantic, but keep the API Pydantic-like
 
 ## PLAYA 0.2.10: 2025-02-18
 - Fix serious bug in rare ' and " text operators
diff --git a/README.md b/README.md
index 1cd05eb..95e8150 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ The purpose of PLAYA is to provide an efficent, parallel and
 parallelizable, pure-Python and Pythonic (for its author's definition
 of the term), lazy interface to the internals of PDF files.
 
-If you just want to extract text from a PDF, there are a better and/or
+If you just want to extract text from a PDF, there are better and/or
 faster tools and libraries out there, notably
 [pypdfium2](https://pypi.org/project/pypdfium2/) and
 [pypdf](https://pypi.org/project/pypdf/), among others.  See [these
@@ -149,7 +149,7 @@ page = pdf.pages["42"]     # or "logical" page number (also a string)
 print(f"Page {page.label} is {page.width} x {page.height}")
 ```
 
-Since PDF is at heard a page-oriented, presentation format, many types
+Since PDF is at heart a page-oriented, presentation format, many types
 of metadata are mostly accessible via the page objects.
 
 For example, annotations (internal or external links) are only
@@ -296,9 +296,16 @@ in the aforementioned interpretation of "device space"):
 ```python
 for obj in page:
     print(f"{obj.object_type} at {obj.bbox}")
+
+    # With space="screen" (the default)
     left, top, right, bottom = obj.bbox
     print(f"  top left is {left, top}")
-    print(f"  bottom right is {right, botom}")
+    print(f"  bottom right is {right, bottom}")
+
+    # With space="page" or space="default"
+    left, bottom, right, top = obj.bbox
+    print(f"  bottom left is {left, bottom}")
+    print(f"  top right is {right, top}")
 ```
 
 Another important piece of information (which `pdfminer.six` does not
diff --git a/benchmarks/text.py b/benchmarks/text.py
index 0bfba13..5f846c3 100644
--- a/benchmarks/text.py
+++ b/benchmarks/text.py
@@ -26,15 +26,34 @@ def benchmark_chars(path: Path):
                     _ = obj.chars
 
 
+def benchmark_text(path: Path):
+    """Extract text, sort of."""
+    import playa
+
+    if path.name in PDFMINER_BUGS or path.name in XFAILS:
+        return
+    passwords = PASSWORDS.get(path.name, [""])
+    for password in passwords:
+        LOG.info("Reading %s", path)
+        with playa.open(path, password=password) as pdf:
+            for page in pdf.pages:
+                page.extract_text()
+
+
 if __name__ == "__main__":
     # Silence warnings about broken PDFs
     logging.basicConfig(level=logging.ERROR)
     niter = 5
-    miner_time = beach_time = lazy_time = 0.0
+    chars_time = text_time = 0.0
     for iter in range(niter + 1):
         for path in BASEPDFS:
             start = time.time()
             benchmark_chars(path)
             if iter != 0:
-                lazy_time += time.time() - start
-    print("chars took %.2fs / iter" % (lazy_time / niter,))
+                chars_time += time.time() - start
+            start = time.time()
+            benchmark_text(path)
+            if iter != 0:
+                text_time += time.time() - start
+    print("chars took %.2fs / iter" % (chars_time / niter,))
+    print("extract_text took %.2fs / iter" % (text_time / niter,))
diff --git a/playa/cli.py b/playa/cli.py
index 2d901f0..b8acb97 100644
--- a/playa/cli.py
+++ b/playa/cli.py
@@ -72,14 +72,12 @@
 import itertools
 import json
 import logging
-import textwrap
 from collections import deque
 from pathlib import Path
 from typing import Any, Deque, Dict, Iterable, Iterator, List, TextIO, Tuple, Union
 
 import playa
 from playa import Document, Page
-from playa.page import MarkedContent, TextObject, XObjectObject
 from playa.pdftypes import ContentStream, ObjRef, resolve1
 from playa.structure import Element, ContentObject as StructContentObject, ContentItem
 from playa.utils import decode_text
@@ -313,112 +311,12 @@ def extract_page_contents(doc: Document, args: argparse.Namespace) -> None:
         args.outfile.buffer.write(data)
 
 
-def get_text_from_obj(obj: TextObject, vertical: bool) -> Tuple[str, float]:
-    """Try to get text from a text object."""
-    chars = []
-    prev_end = 0.0
-    for glyph in obj:
-        x, y = glyph.textstate.glyph_offset
-        off = y if vertical else x
-        # FIXME: This is a heuristic!!!
-        if prev_end and off - prev_end > 0.5:
-            chars.append(" ")
-        if glyph.text is not None:
-            chars.append(glyph.text)
-        prev_end = off + glyph.adv
-    return "".join(chars), prev_end
-
-
-def get_all_texts(page: Union[Page, XObjectObject]) -> Iterator[TextObject]:
-    for obj in page:
-        if isinstance(obj, XObjectObject):
-            yield from get_all_texts(obj)
-        elif isinstance(obj, TextObject):
-            yield obj
-
-
-def get_text_untagged(page: Page) -> str:
-    """Get text from a page of an untagged PDF."""
-    prev_line_matrix = None
-    prev_end = 0.0
-    lines = []
-    strings = []
-    for text in get_all_texts(page):
-        line_matrix = text.textstate.line_matrix
-        vertical = (
-            False if text.textstate.font is None else text.textstate.font.vertical
-        )
-        lpos = -2 if vertical else -1
-        if prev_line_matrix is not None and line_matrix[lpos] < prev_line_matrix[lpos]:
-            lines.append("".join(strings))
-            strings.clear()
-        wpos = -1 if vertical else -2
-        if (
-            prev_line_matrix is not None
-            and prev_end + prev_line_matrix[wpos] < line_matrix[wpos]
-        ):
-            strings.append(" ")
-        textstr, end = get_text_from_obj(text, vertical)
-        strings.append(textstr)
-        prev_line_matrix = line_matrix
-        prev_end = end
-    if strings:
-        lines.append("".join(strings))
-    return "\n".join(lines)
-
-
-def get_text_tagged(page: Page) -> str:
-    """Get text from a page of a tagged PDF."""
-    lines: List[str] = []
-    strings: List[str] = []
-    at_mcs: Union[MarkedContent, None] = None
-    prev_mcid: Union[int, None] = None
-    for text in get_all_texts(page):
-        in_artifact = same_actual_text = reversed_chars = False
-        actual_text = None
-        for mcs in reversed(text.mcstack):
-            if mcs.tag == "Artifact":
-                in_artifact = True
-                break
-            actual_text = mcs.props.get("ActualText")
-            if actual_text is not None:
-                if mcs is at_mcs:
-                    same_actual_text = True
-                at_mcs = mcs
-                break
-            if mcs.tag == "ReversedChars":
-                reversed_chars = True
-                break
-        if in_artifact or same_actual_text:
-            continue
-        if actual_text is None:
-            chars = text.chars
-            if reversed_chars:
-                chars = chars[::-1]
-        else:
-            assert isinstance(actual_text, bytes)
-            chars = actual_text.decode("UTF-16")
-        # Remove soft hyphens
-        chars = chars.replace("\xad", "")
-        # Insert a line break (FIXME: not really correct)
-        if text.mcid != prev_mcid:
-            lines.extend(textwrap.wrap("".join(strings)))
-            strings.clear()
-            prev_mcid = text.mcid
-        strings.append(chars)
-    if strings:
-        lines.extend(textwrap.wrap("".join(strings)))
-    return "\n".join(lines)
-
-
 def extract_text(doc: Document, args: argparse.Namespace) -> None:
     """Extract text, but not in any kind of fancy way."""
     pages = decode_page_spec(doc, args.pages)
-    if "MarkInfo" not in doc.catalog or not doc.catalog["MarkInfo"].get("Marked"):
+    if not doc.is_tagged:
         LOG.warning("Document is not a tagged PDF, text may not be readable")
-        textor = doc.pages[pages].map(get_text_untagged)
-    else:
-        textor = doc.pages[pages].map(get_text_tagged)
+    textor = doc.pages[pages].map(Page.extract_text)
     for text in textor:
         print(text, file=args.outfile)
 
diff --git a/playa/document.py b/playa/document.py
index 0cf60c9..09f4e41 100644
--- a/playa/document.py
+++ b/playa/document.py
@@ -902,6 +902,10 @@ def __init__(
                 self.pdf_version,
             )
             self.pdf_version = self.catalog["Version"]
+        self.is_tagged = False
+        markinfo = resolve1(self.catalog.get("MarkInfo"))
+        if isinstance(markinfo, dict):
+            self.is_tagged = not not markinfo.get("Marked")
 
     def _initialize_password(self, password: str = "") -> None:
         """Initialize the decryption handler with a given password, if any.
diff --git a/playa/page.py b/playa/page.py
index 6f38bdf..edeec5e 100644
--- a/playa/page.py
+++ b/playa/page.py
@@ -5,6 +5,7 @@
 import itertools
 import logging
 import re
+import textwrap
 import warnings
 from copy import copy
 from dataclasses import dataclass
@@ -20,8 +21,10 @@
     Optional,
     Tuple,
     Type,
+    TypeVar,
     Union,
     cast,
+    overload,
 )
 
 from playa.color import (
@@ -80,6 +83,7 @@
 LITERAL_IMAGE = LIT("Image")
 TextSeq = Iterable[Union[int, float, bytes]]
 DeviceSpace = Literal["page", "screen", "default", "user"]
+CO = TypeVar("CO")
 
 
 # FIXME: This should go in utils/pdftypes but there are circular imports
@@ -93,6 +97,23 @@ def parse_rect(o: PDFObject) -> Rect:
         raise PDFSyntaxError("Rectangle contains non-numeric values")
 
 
+# FIXME: This should be a method of TextObject (soon)
+def _extract_text_from_obj(obj: "TextObject", vertical: bool) -> Tuple[str, float]:
+    """Try to get text from a text object."""
+    chars = []
+    prev_end = 0.0
+    for glyph in obj:
+        x, y = glyph.textstate.glyph_offset
+        off = y if vertical else x
+        # FIXME: This is a heuristic!!!
+        if prev_end and off - prev_end > 0.5:
+            chars.append(" ")
+        if glyph.text is not None:
+            chars.append(glyph.text)
+        prev_end = off + glyph.adv
+    return "".join(chars), prev_end
+
+
 class Page:
     """An object that holds the information about a page.
 
@@ -367,6 +388,119 @@ def structtree(self) -> StructTree:
     def __repr__(self) -> str:
         return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"
 
+    @overload
+    def flatten(self) -> Iterator["ContentObject"]: ...
+
+    @overload
+    def flatten(self, filter_class: Type[CO]) -> Iterator[CO]: ...
+
+    def flatten(
+        self, filter_class: Union[None, Type[CO]] = None
+    ) -> Iterator[Union[CO, "ContentObject"]]:
+        """Iterate over content objects, recursing into form XObjects."""
+
+        def flatten_one(itor: Iterable["ContentObject"]) -> Iterator["ContentObject"]:
+            for obj in itor:
+                if isinstance(obj, XObjectObject):
+                    yield from flatten_one(obj)
+                else:
+                    yield obj
+
+        if filter_class is None:
+            yield from flatten_one(self)
+        else:
+            for obj in flatten_one(self):
+                if isinstance(obj, filter_class):
+                    yield obj
+
+    def extract_text(self) -> str:
+        """Do some best-effort text extraction.
+
+        This necessarily involves a few heuristics, so don't get your
+        hopes up.  It will attempt to use marked content information
+        for a tagged PDF, otherwise it will fall back on the character
+        displacement and line matrix to determine word and line breaks.
+        """
+        if self.doc.is_tagged:
+            return self.extract_text_tagged()
+        else:
+            return self.extract_text_untagged()
+
+    def extract_text_untagged(self) -> str:
+        """Get text from a page of an untagged PDF."""
+        prev_line_matrix = None
+        prev_end = 0.0
+        lines = []
+        strings = []
+        for text in self.flatten(TextObject):
+            line_matrix = text.textstate.line_matrix
+            vertical = (
+                False if text.textstate.font is None else text.textstate.font.vertical
+            )
+            lpos = -2 if vertical else -1
+            if (
+                prev_line_matrix is not None
+                and line_matrix[lpos] < prev_line_matrix[lpos]
+            ):
+                lines.append("".join(strings))
+                strings.clear()
+            wpos = -1 if vertical else -2
+            if (
+                prev_line_matrix is not None
+                and prev_end + prev_line_matrix[wpos] < line_matrix[wpos]
+            ):
+                strings.append(" ")
+            textstr, end = _extract_text_from_obj(text, vertical)
+            strings.append(textstr)
+            prev_line_matrix = line_matrix
+            prev_end = end
+        if strings:
+            lines.append("".join(strings))
+        return "\n".join(lines)
+
+    def extract_text_tagged(self) -> str:
+        """Get text from a page of a tagged PDF."""
+        lines: List[str] = []
+        strings: List[str] = []
+        at_mcs: Union[MarkedContent, None] = None
+        prev_mcid: Union[int, None] = None
+        for text in self.flatten(TextObject):
+            in_artifact = same_actual_text = reversed_chars = False
+            actual_text = None
+            for mcs in reversed(text.mcstack):
+                if mcs.tag == "Artifact":
+                    in_artifact = True
+                    break
+                actual_text = mcs.props.get("ActualText")
+                if actual_text is not None:
+                    if mcs is at_mcs:
+                        same_actual_text = True
+                    at_mcs = mcs
+                    break
+                if mcs.tag == "ReversedChars":
+                    reversed_chars = True
+                    break
+            if in_artifact or same_actual_text:
+                continue
+            if actual_text is None:
+                chars = text.chars
+                if reversed_chars:
+                    chars = chars[::-1]
+            else:
+                assert isinstance(actual_text, bytes)
+                chars = actual_text.decode("UTF-16")
+            # Remove soft hyphens
+            chars = chars.replace("\xad", "")
+            # Insert a line break (FIXME: not really correct)
+            if text.mcid != prev_mcid:
+                lines.extend(textwrap.wrap("".join(strings)))
+                strings.clear()
+                prev_mcid = text.mcid
+            strings.append(chars)
+        if strings:
+            lines.extend(textwrap.wrap("".join(strings)))
+        return "\n".join(lines)
+
 
 @dataclass
 class Annotation:
diff --git a/tests/test_document.py b/tests/test_document.py
index ba0b1fe..0311ebd 100644
--- a/tests/test_document.py
+++ b/tests/test_document.py
@@ -10,6 +10,7 @@
 from playa.data_structures import NameTree
 from playa.document import read_header, XRefTable
 from playa.exceptions import PDFSyntaxError
+from playa.page import TextObject
 from playa.parser import LIT
 from playa.utils import decode_text
 from .data import CONTRIB, TESTDIR
@@ -161,9 +162,22 @@ def test_xobjects() -> None:
         assert xobj.object_type == "xobject"
         assert len(list(xobj)) == 2
 
+        for obj in page.flatten():
+            assert obj.object_type != "xobject"
+
+        for obj in page.flatten(TextObject):
+            assert isinstance(obj, TextObject)
+
 
 def test_annotations() -> None:
     with playa.open(TESTDIR / "simple5.pdf") as doc:
         page = doc.pages[0]
         for annot in page.annotations:
             assert annot.page is page
+
+
+def test_is_tagged() -> None:
+    with playa.open(TESTDIR / "simple1.pdf") as doc:
+        assert not doc.is_tagged
+    with playa.open(TESTDIR / "pdf_structure.pdf") as doc:
+        assert doc.is_tagged