Skip to content

Commit

Permalink
feat: new APIs, flatten, extract_text, is_tagged (#62)
Browse files Browse the repository at this point in the history
* feat: new APIs, flatten, extract_text, is_tagged

* fix: be robust when getting MarkInfo

* feat: add extract_text benchmark
  • Loading branch information
dhdaines authored Feb 20, 2025
1 parent 4ce93f3 commit 4a84b70
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 111 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
## PLAYA 0.3.0: unreleased

- API for text extraction
- Extract text from XObjects with `playa --text`
- Remove deprecated `LayoutDict` API and simplify code
- Deprecate `annots` API and add friendly `annotations`
- Elevate `resolve1` and `resolve_all` to top-level exports
Expand All @@ -10,10 +12,12 @@
### TODO

- Deprecate `outlines` API and add tree-structured `outline`
- Link structured elements in outline to structure tree (lazily)
- Deprecate `dests` API and add friendly `destinations`
- Create friendly API for actions
- Expose `XRef` API for users to look around in
- Format all `ContentObject` as JSON
- Methods for all `ContentObject` to format as JSON/dict
- Do not want to use Pydantic but be Pydantic-like

## PLAYA 0.2.10: 2025-02-18
- Fix serious bug in rare ' and " text operators
Expand Down
13 changes: 10 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ The purpose of PLAYA is to provide an efficient, parallel and
parallelizable, pure-Python and Pythonic (for its author's definition
of the term), lazy interface to the internals of PDF files.

If you just want to extract text from a PDF, there are a better and/or
If you just want to extract text from a PDF, there are better and/or
faster tools and libraries out there, notably
[pypdfium2](https://pypi.org/project/pypdfium2/) and
[pypdf](https://pypi.org/project/pypdf/), among others. See [these
Expand Down Expand Up @@ -149,7 +149,7 @@ page = pdf.pages["42"] # or "logical" page number (also a string)
print(f"Page {page.label} is {page.width} x {page.height}")
```

Since PDF is at heard a page-oriented, presentation format, many types
Since PDF is at heart a page-oriented, presentation format, many types
of metadata are mostly accessible via the page objects.

For example, annotations (internal or external links) are only
Expand Down Expand Up @@ -296,9 +296,16 @@ in the aforementioned interpretation of "device space"):
```python
for obj in page:
print(f"{obj.object_type} at {obj.bbox}")

# With space="screen" (the default)
left, top, right, bottom = obj.bbox
print(f" top left is {left, top}")
print(f" bottom right is {right, botom}")
print(f" bottom right is {right, bottom}")

# With space="page" or space="default"
left, bottom, right, top = obj.bbox
print(f" bottom left is {left, bottom}")
print(f" top right is {right, top}")
```

Another important piece of information (which `pdfminer.six` does not
Expand Down
25 changes: 22 additions & 3 deletions benchmarks/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,34 @@ def benchmark_chars(path: Path):
_ = obj.chars


def benchmark_text(path: Path):
    """Run `extract_text` on every page of one PDF, discarding the output.

    Files whose name appears in `PDFMINER_BUGS` or `XFAILS` are skipped.
    The document is opened once per password listed for it in `PASSWORDS`
    (a single empty password when the file has no entry).
    """
    import playa

    filename = path.name
    skipped = filename in PDFMINER_BUGS or filename in XFAILS
    if skipped:
        return
    for password in PASSWORDS.get(filename, [""]):
        LOG.info("Reading %s", path)
        with playa.open(path, password=password) as pdf:
            for page in pdf.pages:
                # Only the work matters for timing; the text is thrown away.
                page.extract_text()


if __name__ == "__main__":
# Silence warnings about broken PDFs
logging.basicConfig(level=logging.ERROR)
niter = 5
miner_time = beach_time = lazy_time = 0.0
chars_time = text_time = 0.0
for iter in range(niter + 1):
for path in BASEPDFS:
start = time.time()
benchmark_chars(path)
if iter != 0:
lazy_time += time.time() - start
print("chars took %.2fs / iter" % (lazy_time / niter,))
chars_time += time.time() - start
start = time.time()
benchmark_text(path)
if iter != 0:
text_time += time.time() - start
print("chars took %.2fs / iter" % (chars_time / niter,))
print("extract_text took %.2fs / iter" % (text_time / niter,))
106 changes: 2 additions & 104 deletions playa/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,12 @@
import itertools
import json
import logging
import textwrap
from collections import deque
from pathlib import Path
from typing import Any, Deque, Dict, Iterable, Iterator, List, TextIO, Tuple, Union

import playa
from playa import Document, Page
from playa.page import MarkedContent, TextObject, XObjectObject
from playa.pdftypes import ContentStream, ObjRef, resolve1
from playa.structure import Element, ContentObject as StructContentObject, ContentItem
from playa.utils import decode_text
Expand Down Expand Up @@ -313,112 +311,12 @@ def extract_page_contents(doc: Document, args: argparse.Namespace) -> None:
args.outfile.buffer.write(data)


def get_text_from_obj(obj: TextObject, vertical: bool) -> Tuple[str, float]:
    """Join the glyph texts of a text object, guessing where spaces belong.

    Args:
        obj: iterable of glyphs; each glyph supplies
            `textstate.glyph_offset`, `text` and `adv`.
        vertical: when true, positions are read from the second component
            of the glyph offset, otherwise from the first.

    Returns:
        The assembled string and the end position (offset plus advance) of
        the last glyph seen, or `0.0` if there were no glyphs.
    """
    pieces = []
    cursor = 0.0
    for glyph in obj:
        horizontal_pos, vertical_pos = glyph.textstate.glyph_offset
        start = vertical_pos if vertical else horizontal_pos
        # FIXME: This is a heuristic!!!  A gap of more than 0.5 between the
        # end of the previous glyph and the start of this one is taken to
        # be a word break.  A zero cursor never triggers a space.
        if cursor and start - cursor > 0.5:
            pieces.append(" ")
        glyph_text = glyph.text
        if glyph_text is not None:
            pieces.append(glyph_text)
        cursor = start + glyph.adv
    return "".join(pieces), cursor


def get_all_texts(page: Union[Page, XObjectObject]) -> Iterator[TextObject]:
    """Yield every text object in a page, descending into XObjects.

    Objects are produced in iteration order; the contents of an
    `XObjectObject` are yielded at the point where it occurs.  Anything
    that is neither an XObject nor a text object is ignored.
    """
    for item in page:
        if isinstance(item, XObjectObject):
            # Recurse so that nested XObjects are flattened in place.
            yield from get_all_texts(item)
            continue
        if isinstance(item, TextObject):
            yield item


def get_text_untagged(page: Page) -> str:
    """Assemble text for a page without relying on marked content.

    Text objects are visited in content order.  A new output line starts
    when the line-matrix component for the line axis is smaller than that
    of the previous text object, and a space is inserted when the current
    object starts beyond the previous object's end position.  The axes
    used depend on whether the object's font is vertical.

    Returns:
        The collected lines joined with newlines.
    """
    lines = []
    current = []
    last_matrix = None
    last_end = 0.0
    for text in get_all_texts(page):
        state = text.textstate
        matrix = state.line_matrix
        font = state.font
        vertical = font.vertical if font is not None else False
        # The last two entries of the line matrix hold the positions used
        # here; which one is the "line" axis depends on writing direction.
        if vertical:
            line_axis, advance_axis = -2, -1
        else:
            line_axis, advance_axis = -1, -2
        if last_matrix is not None:
            if matrix[line_axis] < last_matrix[line_axis]:
                lines.append("".join(current))
                current.clear()
            if last_end + last_matrix[advance_axis] < matrix[advance_axis]:
                current.append(" ")
        fragment, last_end = get_text_from_obj(text, vertical)
        current.append(fragment)
        last_matrix = matrix
    if current:
        lines.append("".join(current))
    return "\n".join(lines)


def get_text_tagged(page: Page) -> str:
    """Assemble text for a page using its marked-content information.

    For each text object the marked-content stack is searched from the
    innermost section outwards, stopping at the first section that is an
    `Artifact`, carries `ActualText`, or is tagged `ReversedChars`:

    - text inside an `Artifact` is dropped;
    - `ActualText` replaces the object's own characters, and is emitted
      only once per marked-content section;
    - `ReversedChars` text is reversed.

    Soft hyphens are removed, and the accumulated text is wrapped and
    flushed whenever the text object's `mcid` changes.

    Returns:
        The wrapped lines joined with newlines.
    """
    wrapped: List[str] = []
    pending: List[str] = []
    last_actual_text_mcs: Union[MarkedContent, None] = None
    last_mcid: Union[int, None] = None

    def flush() -> None:
        # Wrap whatever has accumulated and start a fresh run.
        wrapped.extend(textwrap.wrap("".join(pending)))
        pending.clear()

    for text in get_all_texts(page):
        skip = False
        flipped = False
        actual_text = None
        for mcs in reversed(text.mcstack):
            if mcs.tag == "Artifact":
                skip = True
                break
            actual_text = mcs.props.get("ActualText")
            if actual_text is not None:
                # The same section was already emitted for an earlier text
                # object, so do not repeat its replacement text.
                skip = mcs is last_actual_text_mcs
                last_actual_text_mcs = mcs
                break
            if mcs.tag == "ReversedChars":
                flipped = True
                break
        if skip:
            continue
        if actual_text is not None:
            assert isinstance(actual_text, bytes)
            chars = actual_text.decode("UTF-16")
        elif flipped:
            chars = text.chars[::-1]
        else:
            chars = text.chars
        # Remove soft hyphens
        chars = chars.replace("\xad", "")
        # Insert a line break (FIXME: not really correct)
        if text.mcid != last_mcid:
            flush()
            last_mcid = text.mcid
        pending.append(chars)
    if pending:
        flush()
    return "\n".join(wrapped)


def extract_text(doc: Document, args: argparse.Namespace) -> None:
"""Extract text, but not in any kind of fancy way."""
pages = decode_page_spec(doc, args.pages)
if "MarkInfo" not in doc.catalog or not doc.catalog["MarkInfo"].get("Marked"):
if not doc.is_tagged:
LOG.warning("Document is not a tagged PDF, text may not be readable")
textor = doc.pages[pages].map(get_text_untagged)
else:
textor = doc.pages[pages].map(get_text_tagged)
textor = doc.pages[pages].map(Page.extract_text)
for text in textor:
print(text, file=args.outfile)

Expand Down
4 changes: 4 additions & 0 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -902,6 +902,10 @@ def __init__(
self.pdf_version,
)
self.pdf_version = self.catalog["Version"]
self.is_tagged = False
markinfo = resolve1(self.catalog.get("MarkInfo"))
if isinstance(markinfo, dict):
self.is_tagged = not not markinfo.get("Marked")

def _initialize_password(self, password: str = "") -> None:
"""Initialize the decryption handler with a given password, if any.
Expand Down
Loading

0 comments on commit 4a84b70

Please sign in to comment.