feat: new APIs, flatten, extract_text, is_tagged (#62)

* feat: new APIs, flatten, extract_text, is_tagged
* fix: be robust when getting MarkInfo
* feat: add extract_text benchmark
dhdaines · Feb 20, 2025 · 4a84b70 · 4a84b70
1 parent 4ce93f3
commit 4a84b70
Show file tree

Hide file tree

Showing 7 changed files with 191 additions and 111 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,7 @@
 ## PLAYA 0.3.0: unreleased
 
+- API for text extraction
+- Extract text from XObjects with `playa --text`
 - Remove deprecated `LayoutDict` API and simplify code
 - Deprecate `annots` API and add friendly `annotations`
 - Elevate `resolve1` and `resolve_all` to top-level exports
@@ -10,10 +12,12 @@
 ### TODO
 
 - Deprecate `outlines` API and add tree-structured `outline`
+  - Link structured elements in outline to structure tree (lazily)
 - Deprecate `dests` API and add friendly `destinations`
 - Create friendly API for actions
 - Expose `XRef` API for users to look around in
-- Format all `ContentObject` as JSON
+- Methods for all `ContentObject` to format as JSON/dict
+  - Do not want to use Pydantic but be Pydantic-like
 
 ## PLAYA 0.2.10: 2025-02-18
 - Fix serious bug in rare ' and " text operators

diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ The purpose of PLAYA is to provide an efficient, parallel and
 parallelizable, pure-Python and Pythonic (for its author's definition
 of the term), lazy interface to the internals of PDF files.
 
-If you just want to extract text from a PDF, there are a better and/or
+If you just want to extract text from a PDF, there are better and/or
 faster tools and libraries out there, notably
 [pypdfium2](https://pypi.org/project/pypdfium2/) and
 [pypdf](https://pypi.org/project/pypdf/), among others.  See [these
@@ -149,7 +149,7 @@ page = pdf.pages["42"]     # or "logical" page number (also a string)
 print(f"Page {page.label} is {page.width} x {page.height}")
 ```
 
-Since PDF is at heard a page-oriented, presentation format, many types
+Since PDF is at heart a page-oriented, presentation format, many types
 of metadata are mostly accessible via the page objects.
 
 For example, annotations (internal or external links) are only
@@ -296,9 +296,16 @@ in the aforementioned interpretation of "device space"):
 ```python
 for obj in page:
     print(f"{obj.object_type} at {obj.bbox}")
+
+    # With space="screen" (the default)
     left, top, right, bottom = obj.bbox
     print(f"  top left is {left, top}")
-    print(f"  bottom right is {right, botom}")
+    print(f"  bottom right is {right, bottom}")
+
+    # With space="page" or space="default"
+    left, bottom, right, top = obj.bbox
+    print(f"  bottom left is {left, bottom}")
+    print(f"  top right is {right, top}")
 ```
 
 Another important piece of information (which `pdfminer.six` does not

diff --git a/benchmarks/text.py b/benchmarks/text.py
@@ -26,15 +26,34 @@ def benchmark_chars(path: Path):
                     _ = obj.chars
 
 
+def benchmark_text(path: Path):
+    """Extract text, sort of."""
+    import playa
+
+    if path.name in PDFMINER_BUGS or path.name in XFAILS:
+        return
+    passwords = PASSWORDS.get(path.name, [""])
+    for password in passwords:
+        LOG.info("Reading %s", path)
+        with playa.open(path, password=password) as pdf:
+            for page in pdf.pages:
+                page.extract_text()
+
+
 if __name__ == "__main__":
     # Silence warnings about broken PDFs
     logging.basicConfig(level=logging.ERROR)
     niter = 5
-    miner_time = beach_time = lazy_time = 0.0
+    chars_time = text_time = 0.0
     for iter in range(niter + 1):
         for path in BASEPDFS:
             start = time.time()
             benchmark_chars(path)
             if iter != 0:
-                lazy_time += time.time() - start
-    print("chars took %.2fs / iter" % (lazy_time / niter,))
+                chars_time += time.time() - start
+            start = time.time()
+            benchmark_text(path)
+            if iter != 0:
+                text_time += time.time() - start
+    print("chars took %.2fs / iter" % (chars_time / niter,))
+    print("extract_text took %.2fs / iter" % (text_time / niter,))
diff --git a/playa/cli.py b/playa/cli.py
@@ -72,14 +72,12 @@
 import itertools
 import json
 import logging
-import textwrap
 from collections import deque
 from pathlib import Path
 from typing import Any, Deque, Dict, Iterable, Iterator, List, TextIO, Tuple, Union
 
 import playa
 from playa import Document, Page
-from playa.page import MarkedContent, TextObject, XObjectObject
 from playa.pdftypes import ContentStream, ObjRef, resolve1
 from playa.structure import Element, ContentObject as StructContentObject, ContentItem
 from playa.utils import decode_text
@@ -313,112 +311,12 @@ def extract_page_contents(doc: Document, args: argparse.Namespace) -> None:
         args.outfile.buffer.write(data)
 
 
-def get_text_from_obj(obj: TextObject, vertical: bool) -> Tuple[str, float]:
-    """Try to get text from a text object."""
-    chars = []
-    prev_end = 0.0
-    for glyph in obj:
-        x, y = glyph.textstate.glyph_offset
-        off = y if vertical else x
-        # FIXME: This is a heuristic!!!
-        if prev_end and off - prev_end > 0.5:
-            chars.append(" ")
-        if glyph.text is not None:
-            chars.append(glyph.text)
-        prev_end = off + glyph.adv
-    return "".join(chars), prev_end
-
-
-def get_all_texts(page: Union[Page, XObjectObject]) -> Iterator[TextObject]:
-    for obj in page:
-        if isinstance(obj, XObjectObject):
-            yield from get_all_texts(obj)
-        elif isinstance(obj, TextObject):
-            yield obj
-
-
-def get_text_untagged(page: Page) -> str:
-    """Get text from a page of an untagged PDF."""
-    prev_line_matrix = None
-    prev_end = 0.0
-    lines = []
-    strings = []
-    for text in get_all_texts(page):
-        line_matrix = text.textstate.line_matrix
-        vertical = (
-            False if text.textstate.font is None else text.textstate.font.vertical
-        )
-        lpos = -2 if vertical else -1
-        if prev_line_matrix is not None and line_matrix[lpos] < prev_line_matrix[lpos]:
-            lines.append("".join(strings))
-            strings.clear()
-        wpos = -1 if vertical else -2
-        if (
-            prev_line_matrix is not None
-            and prev_end + prev_line_matrix[wpos] < line_matrix[wpos]
-        ):
-            strings.append(" ")
-        textstr, end = get_text_from_obj(text, vertical)
-        strings.append(textstr)
-        prev_line_matrix = line_matrix
-        prev_end = end
-    if strings:
-        lines.append("".join(strings))
-    return "\n".join(lines)
-
-
-def get_text_tagged(page: Page) -> str:
-    """Get text from a page of a tagged PDF."""
-    lines: List[str] = []
-    strings: List[str] = []
-    at_mcs: Union[MarkedContent, None] = None
-    prev_mcid: Union[int, None] = None
-    for text in get_all_texts(page):
-        in_artifact = same_actual_text = reversed_chars = False
-        actual_text = None
-        for mcs in reversed(text.mcstack):
-            if mcs.tag == "Artifact":
-                in_artifact = True
-                break
-            actual_text = mcs.props.get("ActualText")
-            if actual_text is not None:
-                if mcs is at_mcs:
-                    same_actual_text = True
-                at_mcs = mcs
-                break
-            if mcs.tag == "ReversedChars":
-                reversed_chars = True
-                break
-        if in_artifact or same_actual_text:
-            continue
-        if actual_text is None:
-            chars = text.chars
-            if reversed_chars:
-                chars = chars[::-1]
-        else:
-            assert isinstance(actual_text, bytes)
-            chars = actual_text.decode("UTF-16")
-        # Remove soft hyphens
-        chars = chars.replace("\xad", "")
-        # Insert a line break (FIXME: not really correct)
-        if text.mcid != prev_mcid:
-            lines.extend(textwrap.wrap("".join(strings)))
-            strings.clear()
-            prev_mcid = text.mcid
-        strings.append(chars)
-    if strings:
-        lines.extend(textwrap.wrap("".join(strings)))
-    return "\n".join(lines)
-
-
 def extract_text(doc: Document, args: argparse.Namespace) -> None:
     """Extract text, but not in any kind of fancy way."""
     pages = decode_page_spec(doc, args.pages)
-    if "MarkInfo" not in doc.catalog or not doc.catalog["MarkInfo"].get("Marked"):
+    if not doc.is_tagged:
         LOG.warning("Document is not a tagged PDF, text may not be readable")
-        textor = doc.pages[pages].map(get_text_untagged)
-    else:
-        textor = doc.pages[pages].map(get_text_tagged)
+    textor = doc.pages[pages].map(Page.extract_text)
     for text in textor:
         print(text, file=args.outfile)
 

diff --git a/playa/document.py b/playa/document.py
@@ -902,6 +902,10 @@ def __init__(
                 self.pdf_version,
             )
             self.pdf_version = self.catalog["Version"]
+        self.is_tagged = False
+        markinfo = resolve1(self.catalog.get("MarkInfo"))
+        if isinstance(markinfo, dict):
+            self.is_tagged = not not markinfo.get("Marked")
 
     def _initialize_password(self, password: str = "") -> None:
         """Initialize the decryption handler with a given password, if any.