Skip to content

Commit 08b4e8d

Browse files
committed
more updates
Support pymupdf_layout
1 parent 8f34325 commit 08b4e8d

File tree

6 files changed

+695
-269
lines changed

6 files changed

+695
-269
lines changed

src/__init__.py

Lines changed: 56 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
import zipfile
2929

3030
from . import extra
31-
31+
import importlib.util
3232

3333
# Set up g_out_log and g_out_message from environment variables.
3434
#
@@ -333,6 +333,29 @@ def __init__(self):
333333

334334
_globals = _Globals()
335335

336+
# Optional layout-analysis hook. It is None in this module; `get_layout()`
# calls it with the page object when it has been set to a callable.
_get_layout: typing.Optional[typing.Callable] = None

# True while the one-time hint recommending pymupdf_layout may still be shown.
_recommend_layout = True


def no_recommend_layout():
    """Suppress the pymupdf_layout recommendation for the rest of the process."""
    global _recommend_layout
    _recommend_layout = False


def _layout_package_installed() -> bool:
    """Return True if an import spec for `pymupdf.layout` can be found.

    `importlib.util.find_spec()` imports the parent package of a dotted name
    and raises (ModuleNotFoundError / ValueError) when that fails. A failed
    probe for an optional package is treated as "not installed" so that it
    can never propagate an exception into the caller.
    """
    try:
        return importlib.util.find_spec("pymupdf.layout") is not None
    except (ImportError, ValueError):
        return False


def _warn_layout_once():
    """Print a one-time hint recommending the pymupdf_layout package.

    The hint is printed only when all of the following hold:

    * `_recommend_layout` is still true (the hint was not printed before and
      `no_recommend_layout()` was not called),
    * `_get_layout` is not callable,
    * the environment variable PYMUPDF_SUGGEST_LAYOUT_ANALYZER is not "0",
    * no import spec for `pymupdf.layout` is found.

    After printing, `_recommend_layout` is cleared so the hint is not
    repeated. If any condition fails, nothing is printed and the flag is left
    unchanged.
    """
    msg="""Consider using the pymupdf_layout package for a greatly improved page layout analysis."""

    global _recommend_layout
    # Cheap checks come first; the import-spec probe is the most expensive
    # test and is only reached when everything else allows the hint.
    if (
        _recommend_layout
        and not callable(_get_layout)
        and os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") != "0"
        and not _layout_package_installed()
    ):
        print(msg)
        _recommend_layout = False
358+
336359

337360
# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
338361
# as of 2023-01-20 11:51:40
@@ -1054,6 +1077,7 @@ def get_textpage(self, clip=None, flags=0):
10541077
annot = self.this
10551078
stextpage = mupdf.FzStextPage(annot, options)
10561079
ret = TextPage(stextpage)
1080+
ret._dev_flags = flags
10571081
p = self.get_parent()
10581082
if isinstance(p, weakref.ProxyType):
10591083
ret.parent = p
@@ -2784,6 +2808,7 @@ def get_textpage(self, flags=3):
27842808
stext_options.flags = flags
27852809
val = mupdf.FzStextPage(self.this, stext_options)
27862810
val.thisown = True
2811+
val._dev_flags = flags
27872812
return val
27882813

27892814
@property
@@ -9952,9 +9977,10 @@ def _get_resource_properties(self):
99529977
return rc
99539978

99549979
def _get_textpage(self, clip=None, flags=0, matrix=None):
9955-
if g_use_extra:
9980+
if 1 or g_use_extra:
99569981
ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
99579982
tpage = mupdf.FzStextPage(ll_tpage)
9983+
tpage._dev_flags = flags
99589984
return tpage
99599985
page = self.this
99609986
options = mupdf.FzStextOptions(flags)
@@ -10781,6 +10807,20 @@ def clip_to_rect(self, rect):
1078110807
pclip = JM_rect_from_py(clip)
1078210808
mupdf.pdf_clip_page(pdfpage, pclip)
1078310809

10810+
def get_layout(self, vertical_gap=12):
    """Populate `self.layout_information` via the layout hook, if possible.

    Does nothing when `self.layout_information` is already set (not None)
    or when the module-level `_get_layout` hook is unset / falsy. Otherwise
    the hook is called with this object and its result is stored in
    `self.layout_information`.

    `vertical_gap` is accepted but not used by this method body.

    Always returns None.
    """
    already_present = self.layout_information is not None
    # Short-circuit keeps the original order: the hook is only inspected
    # when no layout information has been stored yet.
    if already_present or not _get_layout:
        return

    self.layout_information = _get_layout(self)
10823+
1078410824
@property
1078510825
def artbox(self):
1078610826
"""The ArtBox"""
@@ -11432,7 +11472,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None):
1143211472
assert isinstance(page, mupdf.FzPage), f'{self.this=}'
1143311473
clips = True if extended else False
1143411474
prect = mupdf.fz_bound_page(page)
11435-
if g_use_extra:
11475+
if 1 or g_use_extra:
1143611476
rc = extra.get_cdrawings(page, extended, callback, method)
1143711477
else:
1143811478
rc = list()
@@ -12146,6 +12186,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
1214612186
if old_rotation != 0:
1214712187
self.set_rotation(old_rotation)
1214812188
textpage = TextPage(textpage)
12189+
textpage._dev_flags = flags
1214912190
textpage.parent = weakref.proxy(self)
1215012191
return textpage
1215112192

@@ -12157,7 +12198,7 @@ def get_texttrace(self):
1215712198
self.set_rotation(0)
1215812199
page = self.this
1215912200
rc = []
12160-
if g_use_extra:
12201+
if 1 or g_use_extra:
1216112202
dev = extra.JM_new_texttrace_device(rc)
1216212203
else:
1216312204
dev = JM_new_texttrace_device(rc)
@@ -13206,6 +13247,9 @@ def xref(self):
1320613247

1320713248
rect = property(bound, doc="page rectangle")
1320813249

13250+
# any result of layout analysis is stored here
13251+
layout_information: typing.Optional[typing.List[tuple]] = None
13252+
1320913253

1321013254
class Pixmap:
1321113255

@@ -16391,7 +16435,7 @@ def _textpage_dict(self, raw=False):
1639116435

1639216436
def extractBLOCKS(self):
1639316437
"""Return a list with text block information."""
16394-
if g_use_extra:
16438+
if 1 or g_use_extra:
1639516439
return extra.extractBLOCKS(self.this)
1639616440
block_n = -1
1639716441
this_tpage = self.this
@@ -16587,7 +16631,7 @@ def extractTextbox(self, rect):
1658716631

1658816632
def extractWORDS(self, delimiters=None):
1658916633
"""Return a list with text word information."""
16590-
if g_use_extra:
16634+
if 1 or g_use_extra:
1659116635
return extra.extractWORDS(self.this, delimiters)
1659216636
buflen = 0
1659316637
last_char_rtl = 0
@@ -18969,7 +19013,7 @@ def JM_color_FromSequence(color):
1896919013

1897019014

1897119015
def JM_color_count( pm, clip):
18972-
if g_use_extra:
19016+
if 1 or g_use_extra:
1897319017
return extra.ll_JM_color_count(pm.m_internal, clip)
1897419018

1897519019
rc = dict()
@@ -20469,7 +20513,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
2046920513

2047020514

2047120515
def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
20472-
if g_use_extra:
20516+
if 1 or g_use_extra:
2047320517
return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
2047420518
char_list = None
2047520519
span_list = []
@@ -20682,7 +20726,7 @@ def JM_make_image_block(block, block_dict):
2068220726

2068320727

2068420728
def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
20685-
if g_use_extra:
20729+
if 1 or g_use_extra:
2068620730
return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
2068720731
line_list = []
2068820732
block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
@@ -20705,8 +20749,8 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
2070520749

2070620750

2070720751
def JM_make_textpage_dict(tp, page_dict, raw):
20708-
if g_use_extra:
20709-
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
20752+
if 1 or g_use_extra:
20753+
return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags)
2071020754
text_buffer = mupdf.fz_new_buffer(128)
2071120755
block_list = []
2071220756
tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -21356,7 +21400,7 @@ def JM_rotate_page_matrix(page):
2135621400

2135721401

2135821402
def JM_search_stext_page(page, needle):
21359-
if g_use_extra:
21403+
if 1 or g_use_extra:
2136021404
return extra.JM_search_stext_page(page.m_internal, needle)
2136121405

2136221406
rect = mupdf.FzRect(page.m_internal.mediabox)

0 commit comments

Comments
 (0)