Skip to content

Commit

Permalink
Refactoring & fix pre tag nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
python273 committed Nov 2, 2021
1 parent 0fcb991 commit 420b9da
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 148 deletions.
191 changes: 62 additions & 129 deletions telegraph/utils.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,11 @@
# -*- coding: utf-8 -*-
import re
from html.parser import HTMLParser
from html.entities import name2codepoint
from html import escape

from .exceptions import NotAllowedTag, InvalidHTML

try: # python 3.x
from html.parser import HTMLParser
from html.entities import name2codepoint
from html import escape

basestring = str

except ImportError: # python 2.x
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
from cgi import escape

chr = unichr


RE_WHITESPACE = re.compile(r'(\s+)', re.UNICODE)

Expand Down Expand Up @@ -50,31 +39,49 @@ def __init__(self):
self.current_nodes = self.nodes
self.parent_nodes = []

self.last_text_node = None

self.tags_path = []

def add_str_node(self, s):
if not s:
return

if self.current_nodes and isinstance(self.current_nodes[-1], basestring):
if 'pre' not in self.tags_path: # keep whitespace in <pre>
s = RE_WHITESPACE.sub(' ', s)

if self.last_text_node is None or self.last_text_node.endswith(' '):
s = s.lstrip(' ')

if not s:
self.last_text_node = None
return

self.last_text_node = s

if self.current_nodes and isinstance(self.current_nodes[-1], str):
self.current_nodes[-1] += s
else:
self.current_nodes.append(s)

def handle_starttag(self, tag, attrs_list):
if tag not in ALLOWED_TAGS:
raise NotAllowedTag('%s tag is not allowed' % tag)
raise NotAllowedTag(f'{tag!r} tag is not allowed')

if tag in BLOCK_ELEMENTS:
self.last_text_node = None

node = {'tag': tag}
self.tags_path.append(tag)
self.current_nodes.append(node)

if attrs_list:
attrs = {}
node['attrs'] = attrs

for attr, value in attrs_list:
attrs[attr] = value

node['attrs'] = attrs

self.current_nodes.append(node)

if tag not in VOID_ELEMENTS:
self.parent_nodes.append(self.current_nodes)
self.current_nodes = node['children'] = []
Expand All @@ -84,18 +91,16 @@ def handle_endtag(self, tag):
return

if not len(self.parent_nodes):
raise InvalidHTML('"{}" missing start tag'.format(
tag
))
raise InvalidHTML(f'{tag!r} missing start tag')

self.current_nodes = self.parent_nodes.pop()

last_node = self.current_nodes[-1]

if last_node['tag'] != tag:
raise InvalidHTML('"{}" tag closed instead of "{}"'.format(
tag, last_node['tag']
))
raise InvalidHTML(f'{tag!r} tag closed instead of {last_node["tag"]!r}')

self.tags_path.pop()

if not last_node['children']:
last_node.pop('children')
Expand All @@ -117,128 +122,56 @@ def handle_charref(self, name):
def get_nodes(self):
if self.parent_nodes:
not_closed_tag = self.parent_nodes[-1][-1]['tag']
raise InvalidHTML('"{}" tag is not closed'.format(not_closed_tag))
raise InvalidHTML(f'{not_closed_tag!r} tag is not closed')

return self.nodes


def clear_whitespace_nodes(nodes, last_text_node=None):
    """
    Collapse whitespace in the text nodes of a node tree.

    In every string node each run of whitespace is replaced by a single
    space. Leading spaces are then stripped when ``last_text_node`` is
    ``None`` or already ends with a space. A string node that ends up empty
    is dropped and resets ``last_text_node`` to ``None``; meeting an element
    whose tag is in ``BLOCK_ELEMENTS`` resets it as well.

    Element dicts are reused in the result and a non-empty ``'children'``
    entry is rebound to a freshly built list. The list objects passed in
    are only read, never modified.

    :param nodes: top-level nodes (strings and ``{'tag': ...}`` dicts)
    :type nodes: list
    :param last_text_node: text node preceding ``nodes``, if any
    :type last_text_node: str or None
    :return: ``(new_nodes, last_text_node)`` - the cleaned top-level list
        and the last text node that was kept (or ``None``)
    :rtype: tuple
    """
    new_nodes = []

    # Explicit stack instead of recursion. Each frame pairs an iterator over
    # the source siblings with the output list those siblings are written to.
    # Iterators resume where they stopped, so no O(n) ``pop(0)`` is needed.
    stack = [(iter(nodes), new_nodes)]

    while stack:
        siblings, new_children = stack[-1]

        for node in siblings:
            if isinstance(node, dict):
                if node['tag'] in BLOCK_ELEMENTS:
                    # A block element starts a new line of text, so any
                    # leading space inside it is insignificant.
                    last_text_node = None

                new_children.append(node)

                node_children = node.get('children')

                if node_children:
                    cleaned = []
                    node['children'] = cleaned
                    stack.append((iter(node_children), cleaned))
                    break  # descend; the parent iterator resumes later
            else:
                node = RE_WHITESPACE.sub(' ', node)

                if last_text_node is None or last_text_node.endswith(' '):
                    node = node.lstrip(' ')

                if node:
                    last_text_node = node
                    new_children.append(node)
                else:
                    last_text_node = None
        else:
            # Sibling list exhausted: return to the parent level.
            stack.pop()

    return new_nodes, last_text_node


def html_to_nodes(html_content):
parser = HtmlToNodesParser()
parser.feed(html_content)

nodes = parser.get_nodes()
nodes, _ = clear_whitespace_nodes(nodes)
return nodes
return parser.get_nodes()


def nodes_to_html(nodes):
html_content = []
out = []
append = out.append

stack = []
tags_stack = []
current_nodes = nodes[:]
curr = nodes
i = -1

while True:
if current_nodes:
node = current_nodes.pop(0)

if type(node) is dict:
tags_stack.append(node['tag'])

attrs = node.get('attrs')

if attrs:
attrs_str = ['']
i += 1

for attr, value in attrs.items():
attrs_str.append('{}="{}"'.format(attr, escape(value)))
else:
attrs_str = []

html_content.append('<{}{}>'.format(
node['tag'],
' '.join(attrs_str)
))
if i >= len(curr):
if not stack:
break
curr, i = stack.pop()
append(f'</{curr[i]["tag"]}>')
continue

children = node.get('children', [])
stack.append(current_nodes)
current_nodes = children
else:
html_content.append(escape(node))
node = curr[i]

if not current_nodes:
if tags_stack:
closed_tag = tags_stack.pop()
if isinstance(node, str):
append(escape(node))
continue

last_el = html_content[-1]
append(f'<{node["tag"]}')

if closed_tag in VOID_ELEMENTS and \
last_el.startswith('<{}'.format(closed_tag)) and \
not last_el.endswith('/>'):
if node.get('attrs'):
for attr, value in node['attrs'].items():
append(f' {attr}="{escape(value)}"')

html_content[-1] = last_el[:-1] + '/>'
else:
html_content.append('</{}>'.format(closed_tag))
if node.get('children'):
append('>')
stack.append((curr, i))
curr, i = node['children'], -1
continue

if stack:
current_nodes = stack.pop()
else:
break
if node["tag"] in VOID_ELEMENTS:
append('/>')
else:
append(f'></{node["tag"]}>')

return ''.join(html_content)
return ''.join(out)
2 changes: 2 additions & 0 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from . import test_html_converter
from . import test_telegraph
43 changes: 24 additions & 19 deletions tests/test_html_converter.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from unittest import TestCase

from telegraph.exceptions import NotAllowedTag, InvalidHTML
from telegraph.utils import html_to_nodes, nodes_to_html, clear_whitespace_nodes
from telegraph.utils import html_to_nodes, nodes_to_html

HTML_TEST_STR = """
<p>Hello, world!<br/></p>
Expand Down Expand Up @@ -71,6 +71,12 @@ def test_html_to_nodes_multi_line(self):
HTML_MULTI_LINES_NODES_LIST
)

def test_uppercase_tags(self):
# Upper-case tag names in the input are expected to come back lower-cased.
self.assertEqual(
html_to_nodes("<P>Hello</P>"),
[{'tag': 'p', 'children': ['Hello']}]
)

def test_html_to_nodes_invalid_html(self):
with self.assertRaises(InvalidHTML):
html_to_nodes('<p><b></p></b>')
Expand Down Expand Up @@ -99,23 +105,11 @@ def test_nodes_to_html_blank(self):
''
)

def test_clear_whitespace_nodes(self):
nodes = [
'\n',
{'tag': 'p', 'children': [
{'tag': 'i', 'children': ['A']},
{'tag': 'b', 'children': [' ']},
{'tag': 'b', 'children': [
'B ',
{'tag': 'i', 'children': ['C']},
{'tag': 'i', 'children': [{'tag': 'b'}]},
' D '
]},
' E '
]},
{'tag': 'p', 'children': [' F ']},
'\n'
]
def test_clear_whitespace(self):
i = (
'\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i>'
' D </b> E </p><p> F </p>\n'
)
expected = [
{'tag': 'p', 'children': [
{'tag': 'i', 'children': ['A']},
Expand All @@ -131,7 +125,18 @@ def test_clear_whitespace_nodes(self):
{'tag': 'p', 'children': ['F ']}
]

self.assertEqual(clear_whitespace_nodes(nodes)[0], expected)
self.assertEqual(html_to_nodes(i), expected)

def test_clear_whitespace_1(self):
# Round trip: html_to_nodes followed by nodes_to_html must yield `y`, which
# is `x` without the newlines around the markup and without the space that
# directly follows "</b>" or opens the second <p>.
x = '\n<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b> E </p><p> F </p>\n'
y = '<p><i>A</i><b> </b><b>B <i>C</i><i><b></b></i> D </b>E </p><p>F </p>'
self.assertEqual(nodes_to_html(html_to_nodes(x)), y)

def test_pre_whitespace_preserved(self):
# Newlines inside <pre> must reach the text node unchanged.
self.assertEqual(
html_to_nodes("<pre>\nhello\nworld</pre>"),
[{'tag': 'pre', 'children': ['\nhello\nworld']}]
)

def test_no_starttag_node(self):
with self.assertRaises(InvalidHTML):
Expand Down

0 comments on commit 420b9da

Please sign in to comment.