Refactoring & fix pre tag nodes

python273 · Nov 2, 2021 · 420b9da
1 parent 0fcb991
commit 420b9da
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 148 deletions.
diff --git a/telegraph/utils.py b/telegraph/utils.py
@@ -1,22 +1,11 @@
 # -*- coding: utf-8 -*-
 import re
+from html.parser import HTMLParser
+from html.entities import name2codepoint
+from html import escape
 
 from .exceptions import NotAllowedTag, InvalidHTML
 
-try:  # python 3.x
-    from html.parser import HTMLParser
-    from html.entities import name2codepoint
-    from html import escape
-
-    basestring = str
-
-except ImportError:  # python 2.x
-    from HTMLParser import HTMLParser
-    from htmlentitydefs import name2codepoint
-    from cgi import escape
-
-    chr = unichr
-
 
 RE_WHITESPACE = re.compile(r'(\s+)', re.UNICODE)
 
@@ -50,31 +39,49 @@ def __init__(self):
         self.current_nodes = self.nodes
         self.parent_nodes = []
 
+        self.last_text_node = None
+
+        self.tags_path = []
+
     def add_str_node(self, s):
         if not s:
             return
 
-        if self.current_nodes and isinstance(self.current_nodes[-1], basestring):
+        if 'pre' not in self.tags_path:  # keep whitespace in <pre>
+            s = RE_WHITESPACE.sub(' ', s)
+
+            if self.last_text_node is None or self.last_text_node.endswith(' '):
+                s = s.lstrip(' ')
+
+            if not s:
+                self.last_text_node = None
+                return
+
+            self.last_text_node = s
+
+        if self.current_nodes and isinstance(self.current_nodes[-1], str):
             self.current_nodes[-1] += s
         else:
             self.current_nodes.append(s)
 
     def handle_starttag(self, tag, attrs_list):
         if tag not in ALLOWED_TAGS:
-            raise NotAllowedTag('%s tag is not allowed' % tag)
+            raise NotAllowedTag(f'{tag!r} tag is not allowed')
+
+        if tag in BLOCK_ELEMENTS:
+            self.last_text_node = None
 
         node = {'tag': tag}
+        self.tags_path.append(tag)
+        self.current_nodes.append(node)
 
         if attrs_list:
             attrs = {}
+            node['attrs'] = attrs
 
             for attr, value in attrs_list:
                 attrs[attr] = value
 
-            node['attrs'] = attrs
-
-        self.current_nodes.append(node)
-
         if tag not in VOID_ELEMENTS:
             self.parent_nodes.append(self.current_nodes)
             self.current_nodes = node['children'] = []
@@ -84,18 +91,16 @@ def handle_endtag(self, tag):
             return
 
         if not len(self.parent_nodes):
-            raise InvalidHTML('"{}" missing start tag'.format(
-                tag
-            ))
+            raise InvalidHTML(f'{tag!r} missing start tag')
 
         self.current_nodes = self.parent_nodes.pop()
 
         last_node = self.current_nodes[-1]
 
         if last_node['tag'] != tag:
-            raise InvalidHTML('"{}" tag closed instead of "{}"'.format(
-                tag, last_node['tag']
-            ))
+            raise InvalidHTML(f'{tag!r} tag closed instead of {last_node["tag"]!r}')
+
+        self.tags_path.pop()
 
         if not last_node['children']:
             last_node.pop('children')
@@ -117,128 +122,56 @@ def handle_charref(self, name):
     def get_nodes(self):
         if self.parent_nodes:
             not_closed_tag = self.parent_nodes[-1][-1]['tag']
-            raise InvalidHTML('"{}" tag is not closed'.format(not_closed_tag))
+            raise InvalidHTML(f'{not_closed_tag!r} tag is not closed')
 
         return self.nodes
 
 
-def clear_whitespace_nodes(nodes, last_text_node=None):
-    """
-
-    :param nodes:
-    :type nodes: list
-    :param last_text_node:
-    :type last_text_node: basestring
-    :return: list
-    """
-    # TODO: probably possible to move to html parser
-
-    stack = []
-    current_nodes = nodes[:]
-
-    new_nodes = []
-    new_children = new_nodes
-
-    while True:
-        if current_nodes:
-            node = current_nodes.pop(0)
-
-            if type(node) is dict:
-                is_block_element = node['tag'] in BLOCK_ELEMENTS
-                if is_block_element:
-                    last_text_node = None
-
-                new_children.append(node)
-
-                node_children = node.get('children')
-
-                if node_children:
-                    stack.append((current_nodes, new_children))
-                    current_nodes = node_children
-                    new_children = []
-                    node['children'] = new_children
-            else:
-                node = RE_WHITESPACE.sub(' ', node)
-
-                if last_text_node is None or last_text_node.endswith(' '):
-                    node = node.lstrip(' ')
-
-                if node:
-                    last_text_node = node
-                    new_children.append(node)
-                else:
-                    last_text_node = None
-
-        if not current_nodes:
-            if stack:
-                current_nodes, new_children = stack.pop()
-            else:
-                break
-
-    return new_nodes, last_text_node
-
-
 def html_to_nodes(html_content):
     parser = HtmlToNodesParser()
     parser.feed(html_content)
-
-    nodes = parser.get_nodes()
-    nodes, _ = clear_whitespace_nodes(nodes)
-    return nodes
+    return parser.get_nodes()
 
 
 def nodes_to_html(nodes):
-    html_content = []
+    out = []
+    append = out.append
 
     stack = []
-    tags_stack = []
-    current_nodes = nodes[:]
+    curr = nodes
+    i = -1
 
     while True:
-        if current_nodes:
-            node = current_nodes.pop(0)
-
-            if type(node) is dict:
-                tags_stack.append(node['tag'])
-
-                attrs = node.get('attrs')
-
-                if attrs:
-                    attrs_str = ['']
+        i += 1
 
-                    for attr, value in attrs.items():
-                        attrs_str.append('{}="{}"'.format(attr, escape(value)))
-                else:
-                    attrs_str = []
-
-                html_content.append('<{}{}>'.format(
-                    node['tag'],
-                    ' '.join(attrs_str)
-                ))
+        if i >= len(curr):
+            if not stack:
+                break
+            curr, i = stack.pop()
+            append(f'</{curr[i]["tag"]}>')
+            continue
 
-                children = node.get('children', [])
-                stack.append(current_nodes)
-                current_nodes = children
-            else:
-                html_content.append(escape(node))
+        node = curr[i]
 
-        if not current_nodes:
-            if tags_stack:
-                closed_tag = tags_stack.pop()
+        if isinstance(node, str):
+            append(escape(node))
+            continue
 
-                last_el = html_content[-1]
+        append(f'<{node["tag"]}')
 
-                if closed_tag in VOID_ELEMENTS and \
-                   last_el.startswith('<{}'.format(closed_tag)) and \
-                   not last_el.endswith('/>'):
+        if node.get('attrs'):
+            for attr, value in node['attrs'].items():
+                append(f' {attr}="{escape(value)}"')
 
-                    html_content[-1] = last_el[:-1] + '/>'
-                else:
-                    html_content.append('</{}>'.format(closed_tag))
+        if node.get('children'):
+            append('>')
+            stack.append((curr, i))
+            curr, i = node['children'], -1
+            continue
 
-            if stack:
-                current_nodes = stack.pop()
-            else:
-                break
+        if node["tag"] in VOID_ELEMENTS:
+            append('/>')
+        else:
+            append(f'></{node["tag"]}>')
 
-    return ''.join(html_content)
+    return ''.join(out)
diff --git a/tests/__init__.py b/tests/__init__.py
@@ -0,0 +1,2 @@
+from . import test_html_converter
+from . import test_telegraph
diff --git a/tests/test_html_converter.py b/tests/test_html_converter.py
@@ -1,7 +1,7 @@
 from unittest import TestCase
 
 from telegraph.exceptions import NotAllowedTag, InvalidHTML
-from telegraph.utils import html_to_nodes, nodes_to_html, clear_whitespace_nodes
+from telegraph.utils import html_to_nodes, nodes_to_html
 
 HTML_TEST_STR = """
 <p>Hello, world!<br/></p>
@@ -71,6 +71,12 @@ def test_html_to_nodes_multi_line(self):
             HTML_MULTI_LINES_NODES_LIST
         )
 
+    def test_uppercase_tags(self):
+        self.assertEqual(
+            html_to_nodes("<P>Hello</P>"),
+            [{'tag': 'p', 'children': ['Hello']}]
+        )
+
     def test_html_to_nodes_invalid_html(self):
         with self.assertRaises(InvalidHTML):
             html_to_nodes('<p><b></p></b>')
@@ -99,23 +105,11 @@ def test_nodes_to_html_blank(self):
             ''
         )
 
-    def test_clear_whitespace_nodes(self):
-        nodes = [
-            '\n',
-            {'tag': 'p', 'children': [
-                {'tag': 'i', 'children': ['A']},
-                {'tag': 'b', 'children': [' ']},
-                {'tag': 'b', 'children': [
-                    'B ',
-                    {'tag': 'i', 'children': ['C']},
-                    {'tag': 'i', 'children': [{'tag': 'b'}]},
-                    ' D '
-                ]},
-                ' E '
-            ]},
-            {'tag': 'p', 'children': [' F ']},
-            '\n'
-        ]
+    def test_clear_whitespace(self):
+        i = (
+            '\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i>'
+            ' D </b> E </p><p> F </p>\n'
+        )
         expected = [
             {'tag': 'p', 'children': [
                 {'tag': 'i', 'children': ['A']},
@@ -131,7 +125,18 @@ def test_clear_whitespace_nodes(self):
             {'tag': 'p', 'children': ['F ']}
         ]
 
-        self.assertEqual(clear_whitespace_nodes(nodes)[0], expected)
+        self.assertEqual(html_to_nodes(i), expected)
+
+    def test_clear_whitespace_1(self):
+        x = '\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b> E </p><p> F </p>\n'
+        y = '<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b>E </p><p>F </p>'
+        self.assertEqual(nodes_to_html(html_to_nodes(x)), y)
+
+    def test_pre_whitespace_preserved(self):
+        self.assertEqual(
+            html_to_nodes("<pre>\nhello\nworld</pre>"),
+            [{'tag': 'pre', 'children': ['\nhello\nworld']}]
+        )
 
     def test_no_starttag_node(self):
         with self.assertRaises(InvalidHTML):
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from . import test_html_converter
		from . import test_telegraph