2828import zipfile
2929
3030from . import extra
31- 
31+ import importlib.util 
3232
3333# Set up g_out_log and g_out_message from environment variables.
3434#
@@ -333,6 +333,29 @@ def __init__(self):
333333
334334_globals = _Globals()
335335
# Optional layout-analysis hook. It starts out as None; when it holds a
# callable, the `get_layout` method invokes it with the page object and stores
# the result in the page's `layout_information` attribute.
# NOTE(review): presumably assigned by an optional layout add-on after import;
# the assignment site is not visible here -- confirm.
336+ _get_layout: typing.Optional[typing.Callable] = None
# True while the one-time "consider the layout package" hint may still be
# printed. Set to False by no_recommend_layout() and by _warn_layout_once()
# right after it has printed the hint.
337+ _recommend_layout = True
338+ 
339+ 
def no_recommend_layout():
    """Switch off the layout-package recommendation for this process.

    Clears the module-level `_recommend_layout` flag, which
    `_warn_layout_once()` requires to be truthy before it prints its hint.
    Takes no arguments and returns None.
    """
    # Rebind the module-level flag rather than creating a local of that name.
    global _recommend_layout
    _recommend_layout = False
343+ 
344+ 
def _warn_layout_once():
    """Print a one-time hint recommending the pymupdf_layout package.

    The hint is printed only if all of the following hold, checked in this
    order (each check is skipped once an earlier one has failed):

    * the module-level `_recommend_layout` flag is still truthy,
    * `_get_layout` is not a callable,
    * the environment variable PYMUPDF_SUGGEST_LAYOUT_ANALYZER is not "0",
    * `importlib.util.find_spec("pymupdf.layout")` finds nothing.

    After printing, `_recommend_layout` is set to False so that later calls
    return at the first check. Returns None.
    """
    global _recommend_layout

    if not _recommend_layout:
        # Hint already shown, or disabled via no_recommend_layout().
        return
    if callable(_get_layout):
        # A layout hook is installed; nothing to recommend.
        return
    if os.getenv("PYMUPDF_SUGGEST_LAYOUT_ANALYZER") == "0":
        # Explicit opt-out through the environment.
        return
    if importlib.util.find_spec("pymupdf.layout"):
        # The layout module can be located; no recommendation needed.
        return

    print("Consider using the pymupdf_layout package for a greatly improved page layout analysis.")
    _recommend_layout = False
358+ 
336359
337360# Optionally use MuPDF via cppyy bindings; experimental and not tested recently
338361# as of 2023-01-20 11:51:40
@@ -1054,6 +1077,7 @@ def get_textpage(self, clip=None, flags=0):
10541077        annot = self.this
10551078        stextpage = mupdf.FzStextPage(annot, options)
10561079        ret = TextPage(stextpage)
1080+         ret._dev_flags = flags
10571081        p = self.get_parent()
10581082        if isinstance(p, weakref.ProxyType):
10591083            ret.parent = p
@@ -2784,6 +2808,7 @@ def get_textpage(self, flags=3):
27842808        stext_options.flags = flags
27852809        val = mupdf.FzStextPage(self.this, stext_options)
27862810        val.thisown = True
2811+         val._dev_flags = flags
27872812        return val
27882813
27892814    @property
@@ -9952,9 +9977,10 @@ def _get_resource_properties(self):
99529977        return rc
99539978
99549979    def _get_textpage(self, clip=None, flags=0, matrix=None):
9955-         if g_use_extra:
9980+         if 1 or  g_use_extra:
99569981            ll_tpage = extra.page_get_textpage(self.this, clip, flags, matrix)
99579982            tpage = mupdf.FzStextPage(ll_tpage)
9983+             tpage._dev_flags = flags
99589984            return tpage
99599985        page = self.this
99609986        options = mupdf.FzStextOptions(flags)
@@ -10781,6 +10807,20 @@ def clip_to_rect(self, rect):
1078110807        pclip = JM_rect_from_py(clip)
1078210808        mupdf.pdf_clip_page(pdfpage, pclip)
1078310809
def get_layout(self, vertical_gap=12):
    """Fill `self.layout_information` using the module-level `_get_layout` hook.

    Nothing happens when `self.layout_information` is already set (not None)
    or when no hook is installed (`_get_layout` is falsy). Otherwise the hook
    is called with this object as its only argument and its return value is
    stored in `self.layout_information`.

    Args:
        vertical_gap: accepted for the caller's convenience; it is not used
            by this method's body.

    Returns:
        None. The result is delivered via `self.layout_information`.
    """
    # Check the cached result first, then the hook -- same order as before.
    already_present = self.layout_information is not None
    if already_present or not _get_layout:
        return

    self.layout_information = _get_layout(self)
10823+ 
1078410824    @property
1078510825    def artbox(self):
1078610826        """The ArtBox"""
@@ -11432,7 +11472,7 @@ def get_cdrawings(self, extended=None, callback=None, method=None):
1143211472        assert isinstance(page, mupdf.FzPage), f'{self.this=}'
1143311473        clips = True if extended else False
1143411474        prect = mupdf.fz_bound_page(page)
11435-         if g_use_extra:
11475+         if 1 or  g_use_extra:
1143611476            rc = extra.get_cdrawings(page, extended, callback, method)
1143711477        else:
1143811478            rc = list()
@@ -12146,6 +12186,7 @@ def get_textpage(self, clip: rect_like = None, flags: int = 0, matrix=None) -> "
1214612186            if old_rotation != 0:
1214712187                self.set_rotation(old_rotation)
1214812188        textpage = TextPage(textpage)
12189+         textpage._dev_flags = flags
1214912190        textpage.parent = weakref.proxy(self)
1215012191        return textpage
1215112192
@@ -12157,7 +12198,7 @@ def get_texttrace(self):
1215712198            self.set_rotation(0)
1215812199        page = self.this
1215912200        rc = []
12160-         if g_use_extra:
12201+         if 1 or  g_use_extra:
1216112202            dev = extra.JM_new_texttrace_device(rc)
1216212203        else:
1216312204            dev = JM_new_texttrace_device(rc)
@@ -13206,6 +13247,9 @@ def xref(self):
1320613247
1320713248    rect = property(bound, doc="page rectangle")
1320813249
13250+     # any result of layout analysis is stored here
13251+     layout_information: typing.Optional[typing.List[tuple]] = None
13252+ 
1320913253
1321013254class Pixmap:
1321113255
@@ -16391,7 +16435,7 @@ def _textpage_dict(self, raw=False):
1639116435
1639216436    def extractBLOCKS(self):
1639316437        """Return a list with text block information."""
16394-         if g_use_extra:
16438+         if 1 or  g_use_extra:
1639516439            return extra.extractBLOCKS(self.this)
1639616440        block_n = -1
1639716441        this_tpage = self.this
@@ -16587,7 +16631,7 @@ def extractTextbox(self, rect):
1658716631
1658816632    def extractWORDS(self, delimiters=None):
1658916633        """Return a list with text word information."""
16590-         if g_use_extra:
16634+         if 1 or  g_use_extra:
1659116635            return extra.extractWORDS(self.this, delimiters)
1659216636        buflen = 0
1659316637        last_char_rtl = 0
@@ -18969,7 +19013,7 @@ def JM_color_FromSequence(color):
1896919013
1897019014
1897119015def JM_color_count( pm, clip):
18972-     if g_use_extra:
19016+     if 1 or  g_use_extra:
1897319017        return extra.ll_JM_color_count(pm.m_internal, clip)
1897419018
1897519019    rc = dict()
@@ -20469,7 +20513,7 @@ def JM_make_annot_DA(annot, ncol, col, fontname, fontsize):
2046920513
2047020514
2047120515def JM_make_spanlist(line_dict, line, raw, buff, tp_rect):
20472-     if g_use_extra:
20516+     if 1 or  g_use_extra:
2047320517        return extra.JM_make_spanlist(line_dict, line, raw, buff, tp_rect)
2047420518    char_list = None
2047520519    span_list = []
@@ -20682,7 +20726,7 @@ def JM_make_image_block(block, block_dict):
2068220726
2068320727
2068420728def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
20685-     if g_use_extra:
20729+     if 1 or  g_use_extra:
2068620730        return extra.JM_make_text_block(block.m_internal, block_dict, raw, buff.m_internal, tp_rect.m_internal)
2068720731    line_list = []
2068820732    block_rect = mupdf.FzRect(mupdf.FzRect.Fixed_EMPTY)
@@ -20705,8 +20749,8 @@ def JM_make_text_block(block, block_dict, raw, buff, tp_rect):
2070520749
2070620750
2070720751def JM_make_textpage_dict(tp, page_dict, raw):
20708-     if g_use_extra:
20709-         return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw)
20752+     if 1 or  g_use_extra:
20753+         return extra.JM_make_textpage_dict(tp.m_internal, page_dict, raw, tp._dev_flags )
2071020754    text_buffer = mupdf.fz_new_buffer(128)
2071120755    block_list = []
2071220756    tp_rect = mupdf.FzRect(tp.m_internal.mediabox)
@@ -21356,7 +21400,7 @@ def JM_rotate_page_matrix(page):
2135621400
2135721401
2135821402def JM_search_stext_page(page, needle):
21359-     if g_use_extra:
21403+     if 1 or  g_use_extra:
2136021404        return extra.JM_search_stext_page(page.m_internal, needle)
2136121405
2136221406    rect = mupdf.FzRect(page.m_internal.mediabox)
0 commit comments