From 4d0a29e491da5d2ea14b6aea7a079a14f71e6fd6 Mon Sep 17 00:00:00 2001
From: Huon Wilson <Huon.Wilson@data61.csiro.au>
Date: Wed, 6 May 2020 12:35:23 +1000
Subject: [PATCH] Allow using alternative markdown renderers

Shelling out to pandoc twice for each markdown cell can be very slow. Using a
single-process/in-memory renderer for any markdown can be noticeably faster.

For instance, builds of https://github.com/stellargraph/stellargraph's docs go
from ~2 minutes to 40 seconds.
---
 doc/usage.ipynb |  12 +++
 src/nbsphinx.py | 227 +++++++++++++++++++++++++++++-------------------
 2 files changed, 149 insertions(+), 90 deletions(-)

diff --git a/doc/usage.ipynb b/doc/usage.ipynb
index 31125a75..60a8799b 100644
--- a/doc/usage.ipynb
+++ b/doc/usage.ipynb
@@ -323,6 +323,18 @@
     "See [Configuring the Kernels](configuring-kernels.ipynb#Kernel-Name)."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### `nbsphinx_markdown_renderer`\n",
+    "\n",
+    "Select the renderer used to convert Markdown cells. Possible values:\n",
+    "\n",
+    "- `pandoc` (default): a featureful renderer using [Pandoc](https://pandoc.org)\n",
+    "- `commonmark`: a faster renderer that doesn't support some Pandoc extensions, using [the commonmark.py library](https://commonmarkpy.readthedocs.io)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/src/nbsphinx.py b/src/nbsphinx.py
index 9f4ea61d..4358bb81 100644
--- a/src/nbsphinx.py
+++ b/src/nbsphinx.py
@@ -717,7 +717,8 @@ class Exporter(nbconvert.RSTExporter):
     """
 
     def __init__(self, execute='auto', kernel_name='', execute_arguments=[],
-                 allow_errors=False, timeout=None, codecell_lexer='none'):
+                 allow_errors=False, timeout=None, codecell_lexer='none',
+                 markdown_renderer='pandoc'):
         """Initialize the Exporter."""
 
         # NB: The following stateful Jinja filters are a hack until
@@ -744,6 +745,14 @@ def replace_attachments(text):
             del attachment_storage[:]
             return text
 
+        renderer_cls = MARKDOWN_RENDERERS.get(markdown_renderer)
+        if renderer_cls is None:
+            known = ", ".join(MARKDOWN_RENDERERS.keys())
+            raise ValueError(
+                'invalid renderer option: {!r} (known renderers: {})'.format(markdown_renderer, known)
+            )
+        renderer = renderer_cls()
+
         self._execute = execute
         self._kernel_name = kernel_name
         self._execute_arguments = execute_arguments
@@ -759,8 +768,8 @@ def replace_attachments(text):
                 'RegexRemovePreprocessor': {'enabled': False},
             }),
             filters={
-                'convert_pandoc': convert_pandoc,
-                'markdown2rst': markdown2rst,
+                'convert_pandoc': renderer.convert_pandoc,
+                'markdown2rst': renderer.markdown2rst,
                 'get_empty_lines': _get_empty_lines,
                 'extract_gallery_or_toctree': _extract_gallery_or_toctree,
                 'save_attachments': save_attachments,
@@ -986,6 +995,7 @@ def parse(self, inputstring, document):
             allow_errors=env.config.nbsphinx_allow_errors,
             timeout=env.config.nbsphinx_timeout,
             codecell_lexer=env.config.nbsphinx_codecell_lexer,
+            markdown_renderer=env.config.nbsphinx_markdown_renderer
         )
 
         try:
@@ -1208,18 +1218,6 @@ def run(self):
         return [gallerytoc]
 
 
-def convert_pandoc(text, from_format, to_format):
-    """Simple wrapper for markdown2rst.
-
-    In nbconvert version 5.0, the use of markdown2rst in the RST
-    template was replaced by the new filter function convert_pandoc.
-
-    """
-    if from_format != 'markdown' and to_format != 'rst':
-        raise ValueError('Unsupported conversion')
-    return markdown2rst(text)
-
-
 class CitationParser(html.parser.HTMLParser):
 
     def handle_starttag(self, tag, attrs):
@@ -1285,88 +1283,136 @@ def reset(self):
         self.obj = {}
 
 
-def markdown2rst(text):
-    """Convert a Markdown string to reST via pandoc.
+class MarkdownRenderer:
+    def convert_pandoc(self, text, from_format, to_format):
+        """Simple wrapper for markdown2rst.
 
-    This is very similar to nbconvert.filters.markdown.markdown2rst(),
-    except that it uses a pandoc filter to convert raw LaTeX blocks to
-    "math" directives (instead of "raw:: latex" directives).
+        In nbconvert version 5.0, the use of markdown2rst in the RST
+        template was replaced by the new filter function convert_pandoc.
+
+        """
+        if from_format != 'markdown' and to_format != 'rst':
+            raise ValueError('Unsupported conversion')
+
+        return self.markdown2rst(text)
 
-    NB: At some point, pandoc changed its behavior!  In former times,
-    it converted LaTeX math environments to RawBlock ("latex"), at some
-    later point this was changed to RawInline ("tex").
-    Either way, we convert it to Math/DisplayMath.
 
+class CommonMarkMarkdownRenderer(MarkdownRenderer):
+    """ Convert a Markdown string to reST via commonmark.
     """
+    def __init__(self):
+        try:
+            import commonmark
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(e.msg + '. Please install it (such as with \'pip install commonmark\') to use \'nbsphinx_markdown_renderer = "commonmark"\'') from e
+
+        class AnonymousLinks(commonmark.ReStructuredTextRenderer):
+            def link(self, node, entering):
+                if entering:
+                    self.out('`')
+                else:
+                    # emit anonymous hyperlinks (`text <url>`__) instead of named ones (`text <url>`_)
+                    self.out(' <%s>`__' % node.destination)
+
+        self._parser = commonmark.Parser()
+        self._renderer = AnonymousLinks()
 
-    def parse_citation(obj):
-        p = CitationParser()
-        p.feed(obj['c'][1])
-        p.close()
-        return p
-
-    def parse_img(obj):
-        p = ImgParser()
-        p.feed(obj['c'][1])
-        p.close()
-        return p
-
-    def object_hook(obj):
-        if object_hook.open_cite_tag:
-            if obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
+    def markdown2rst(self, text):
+        """
+        This is very similar to nbconvert.filters.markdown.markdown2rst(), except
+        that it uses commonmark (with some extensions) instead of Pandoc, for speed.
+        """
+
+        ast = self._parser.parse(text)
+        return self._renderer.render(ast)
+
+
+class PandocMarkdownRenderer(MarkdownRenderer):
+    def markdown2rst(self, text):
+        """Convert a Markdown string to reST via pandoc.
+
+        This is very similar to nbconvert.filters.markdown.markdown2rst(),
+        except that it uses a pandoc filter to convert raw LaTeX blocks to
+        "math" directives (instead of "raw:: latex" directives).
+
+        NB: At some point, pandoc changed its behavior!  In former times,
+        it converted LaTeX math environments to RawBlock ("latex"), at some
+        later point this was changed to RawInline ("tex").
+        Either way, we convert it to Math/DisplayMath.
+
+        """
+
+        def parse_citation(obj):
+            p = CitationParser()
+            p.feed(obj['c'][1])
+            p.close()
+            return p
+
+        def parse_img(obj):
+            p = ImgParser()
+            p.feed(obj['c'][1])
+            p.close()
+            return p
+
+        def object_hook(obj):
+            if object_hook.open_cite_tag:
+                if obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
+                    p = parse_citation(obj)
+                    if p.endtag == object_hook.open_cite_tag:
+                        object_hook.open_cite_tag = ''
+                return {'t': 'Str', 'c': ''}  # Object is replaced by empty string
+
+            if obj.get('t') == 'RawBlock' and obj['c'][0] == 'latex':
+                obj['t'] = 'Para'
+                obj['c'] = [{
+                    't': 'Math',
+                    'c': [
+                        {'t': 'DisplayMath', 'c': []},
+                        # Special marker characters are removed below:
+                        '\x0e:nowrap:\x0f\n\n' + obj['c'][1],
+                    ]
+                }]
+            elif obj.get('t') == 'RawInline' and obj['c'][0] == 'tex':
+                obj = {'t': 'RawInline',
+                       'c': ['rst', ':nbsphinx-math:`{}`'.format(obj['c'][1])]}
+            elif obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
                 p = parse_citation(obj)
-                if p.endtag == object_hook.open_cite_tag:
-                    object_hook.open_cite_tag = ''
-            return {'t': 'Str', 'c': ''}  # Object is replaced by empty string
-
-        if obj.get('t') == 'RawBlock' and obj['c'][0] == 'latex':
-            obj['t'] = 'Para'
-            obj['c'] = [{
-                't': 'Math',
-                'c': [
-                    {'t': 'DisplayMath', 'c': []},
-                    # Special marker characters are removed below:
-                    '\x0e:nowrap:\x0f\n\n' + obj['c'][1],
-                ]
-            }]
-        elif obj.get('t') == 'RawInline' and obj['c'][0] == 'tex':
-            obj = {'t': 'RawInline',
-                   'c': ['rst', ':nbsphinx-math:`{}`'.format(obj['c'][1])]}
-        elif obj.get('t') == 'RawInline' and obj['c'][0] == 'html':
-            p = parse_citation(obj)
-            if p.starttag:
-                object_hook.open_cite_tag = p.starttag
-            if p.cite:
-                obj = {'t': 'RawInline', 'c': ['rst', p.cite]}
-            if not p.starttag and not p.cite:
-                p = parse_img(obj)
-                if p.obj:
-                    obj = p.obj
-                    object_hook.image_definitions.append(p.definition)
-        return obj
-
-    object_hook.open_cite_tag = ''
-    object_hook.image_definitions = []
-
-    def filter_func(text):
-        json_data = json.loads(text, object_hook=object_hook)
-        return json.dumps(json_data)
-
-    input_format = 'markdown'
-    input_format += '-implicit_figures'
-    v = nbconvert.utils.pandoc.get_pandoc_version()
-    if nbconvert.utils.version.check_version(v, '1.13'):
-        input_format += '-native_divs+raw_html'
+                if p.starttag:
+                    object_hook.open_cite_tag = p.starttag
+                if p.cite:
+                    obj = {'t': 'RawInline', 'c': ['rst', p.cite]}
+                if not p.starttag and not p.cite:
+                    p = parse_img(obj)
+                    if p.obj:
+                        obj = p.obj
+                        object_hook.image_definitions.append(p.definition)
+            return obj
+
+        object_hook.open_cite_tag = ''
+        object_hook.image_definitions = []
+
+        def filter_func(text):
+            json_data = json.loads(text, object_hook=object_hook)
+            return json.dumps(json_data)
+
+        input_format = 'markdown'
+        input_format += '-implicit_figures'
+        v = nbconvert.utils.pandoc.get_pandoc_version()
+        if nbconvert.utils.version.check_version(v, '1.13'):
+            input_format += '-native_divs+raw_html'
+
+        rststring = pandoc(text, input_format, 'rst', filter_func=filter_func)
+        rststring = re.sub(
+            r'^\n( *)\x0e:nowrap:\x0f$',
+            r'\1:nowrap:',
+            rststring,
+            flags=re.MULTILINE)
+        rststring += '\n\n'
+        rststring += '\n'.join(object_hook.image_definitions)
+        return rststring
+
+MARKDOWN_RENDERERS = {"pandoc": PandocMarkdownRenderer, "commonmark": CommonMarkMarkdownRenderer}
 
-    rststring = pandoc(text, input_format, 'rst', filter_func=filter_func)
-    rststring = re.sub(
-        r'^\n( *)\x0e:nowrap:\x0f$',
-        r'\1:nowrap:',
-        rststring,
-        flags=re.MULTILINE)
-    rststring += '\n\n'
-    rststring += '\n'.join(object_hook.image_definitions)
-    return rststring
 
 
 def pandoc(source, fmt, to, filter_func=None):
@@ -2161,6 +2207,7 @@ def setup(app):
     app.add_config_value('nbsphinx_widgets_path', None, rebuild='html')
     app.add_config_value('nbsphinx_widgets_options', {}, rebuild='html')
     app.add_config_value('nbsphinx_thumbnails', {}, rebuild='html')
+    app.add_config_value('nbsphinx_markdown_renderer', 'pandoc', rebuild='env')
 
     app.add_directive('nbinput', NbInput)
     app.add_directive('nboutput', NbOutput)