Skip to content

Commit e935ce8

Browse files
committed
Merge branch 'develop'
2 parents b5c724a + fe8a821 commit e935ce8

9 files changed

+233
-119
lines changed

README.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,9 @@ escape_underscores
128128
Defaults to ``True``.
129129

130130
escape_misc
131-
If set to ``False``, do not escape miscellaneous punctuation characters
131+
If set to ``True``, escape miscellaneous punctuation characters
132132
that sometimes have Markdown significance in text.
133-
Defaults to ``True``.
133+
Defaults to ``False``.
134134

135135
keep_inline_images_in
136136
Images are converted to their alt-text when the images are located inside

markdownify/__init__.py

+105-48
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
convert_heading_re = re.compile(r'convert_h(\d+)')
88
line_beginning_re = re.compile(r'^', re.MULTILINE)
99
whitespace_re = re.compile(r'[\t ]+')
10-
all_whitespace_re = re.compile(r'[\s]+')
10+
all_whitespace_re = re.compile(r'[\t \r\n]+')
11+
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
1112
html_heading_re = re.compile(r'h[1-6]')
1213

1314

@@ -66,6 +67,23 @@ def _todict(obj):
6667
return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
6768

6869

70+
def should_remove_whitespace_inside(el):
71+
"""Return whether to remove whitespace immediately inside a block-level element."""
72+
if not el or not el.name:
73+
return False
74+
if html_heading_re.match(el.name) is not None:
75+
return True
76+
return el.name in ('p', 'blockquote',
77+
'ol', 'ul', 'li',
78+
'table', 'thead', 'tbody', 'tfoot',
79+
'tr', 'td', 'th')
80+
81+
82+
def should_remove_whitespace_outside(el):
83+
"""Return whether to remove whitespace immediately outside a block-level element."""
84+
return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
85+
86+
6987
class MarkdownConverter(object):
7088
class DefaultOptions:
7189
autolinks = True
@@ -76,7 +94,7 @@ class DefaultOptions:
7694
default_title = False
7795
escape_asterisks = True
7896
escape_underscores = True
79-
escape_misc = True
97+
escape_misc = False
8098
heading_style = UNDERLINED
8199
keep_inline_images_in = []
82100
newline_style = SPACES
@@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
119137
if not children_only and (isHeading or isCell):
120138
convert_children_as_inline = True
121139

122-
# Remove whitespace-only textnodes in purely nested nodes
123-
def is_nested_node(el):
124-
return el and el.name in ['ol', 'ul', 'li',
125-
'table', 'thead', 'tbody', 'tfoot',
126-
'tr', 'td', 'th']
127-
128-
if is_nested_node(node):
129-
for el in node.children:
130-
# Only extract (remove) whitespace-only text node if any of the
131-
# conditions is true:
132-
# - el is the first element in its parent
133-
# - el is the last element in its parent
134-
# - el is adjacent to an nested node
135-
can_extract = (not el.previous_sibling
136-
or not el.next_sibling
137-
or is_nested_node(el.previous_sibling)
138-
or is_nested_node(el.next_sibling))
139-
if (isinstance(el, NavigableString)
140-
and six.text_type(el).strip() == ''
141-
and can_extract):
142-
el.extract()
140+
# Remove whitespace-only textnodes just before, after or
141+
# inside block-level elements.
142+
should_remove_inside = should_remove_whitespace_inside(node)
143+
for el in node.children:
144+
# Only extract (remove) whitespace-only text node if any of the
145+
# conditions is true:
146+
# - el is the first element in its parent (block-level)
147+
# - el is the last element in its parent (block-level)
148+
# - el is adjacent to a block-level node
149+
can_extract = (should_remove_inside and (not el.previous_sibling
150+
or not el.next_sibling)
151+
or should_remove_whitespace_outside(el.previous_sibling)
152+
or should_remove_whitespace_outside(el.next_sibling))
153+
if (isinstance(el, NavigableString)
154+
and six.text_type(el).strip() == ''
155+
and can_extract):
156+
el.extract()
143157

144158
# Convert the children first
145159
for el in node.children:
@@ -148,7 +162,13 @@ def is_nested_node(el):
148162
elif isinstance(el, NavigableString):
149163
text += self.process_text(el)
150164
else:
151-
text += self.process_tag(el, convert_children_as_inline)
165+
text_strip = text.rstrip('\n')
166+
newlines_left = len(text) - len(text_strip)
167+
next_text = self.process_tag(el, convert_children_as_inline)
168+
next_text_strip = next_text.lstrip('\n')
169+
newlines_right = len(next_text) - len(next_text_strip)
170+
newlines = '\n' * max(newlines_left, newlines_right)
171+
text = text_strip + newlines + next_text_strip
152172

153173
if not children_only:
154174
convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -162,18 +182,26 @@ def process_text(self, el):
162182

163183
# normalize whitespace if we're not inside a preformatted element
164184
if not el.find_parent('pre'):
165-
text = whitespace_re.sub(' ', text)
185+
if self.options['wrap']:
186+
text = all_whitespace_re.sub(' ', text)
187+
else:
188+
text = newline_whitespace_re.sub('\n', text)
189+
text = whitespace_re.sub(' ', text)
166190

167191
# escape special characters if we're not inside a preformatted or code element
168192
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
169193
text = self.escape(text)
170194

171-
# remove trailing whitespaces if any of the following condition is true:
172-
# - current text node is the last node in li
173-
# - current text node is followed by an embedded list
174-
if (el.parent.name == 'li'
175-
and (not el.next_sibling
176-
or el.next_sibling.name in ['ul', 'ol'])):
195+
# remove leading whitespace at the start or just after a
196+
# block-level element; remove trailing whitespace at the end
197+
# or just before a block-level element.
198+
if (should_remove_whitespace_outside(el.previous_sibling)
199+
or (should_remove_whitespace_inside(el.parent)
200+
and not el.previous_sibling)):
201+
text = text.lstrip()
202+
if (should_remove_whitespace_outside(el.next_sibling)
203+
or (should_remove_whitespace_inside(el.parent)
204+
and not el.next_sibling)):
177205
text = text.rstrip()
178206

179207
return text
@@ -208,20 +236,32 @@ def escape(self, text):
208236
if not text:
209237
return ''
210238
if self.options['escape_misc']:
211-
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
212-
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
239+
text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
240+
# A sequence of one or more consecutive '-', preceded and
241+
# followed by whitespace or start/end of fragment, might
242+
# be confused with an underline of a header, or with a
243+
# list marker.
244+
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
245+
# A sequence of up to six consecutive '#', preceded and
246+
# followed by whitespace or start/end of fragment, might
247+
# be confused with an ATX heading.
248+
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
249+
# '.' or ')' preceded by up to nine digits might be
250+
# confused with a list item.
251+
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
252+
text)
213253
if self.options['escape_asterisks']:
214254
text = text.replace('*', r'\*')
215255
if self.options['escape_underscores']:
216256
text = text.replace('_', r'\_')
217257
return text
218258

219-
def indent(self, text, level):
220-
return line_beginning_re.sub('\t' * level, text) if text else ''
259+
def indent(self, text, columns):
260+
return line_beginning_re.sub(' ' * columns, text) if text else ''
221261

222262
def underline(self, text, pad_char):
223263
text = (text or '').rstrip()
224-
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
264+
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
225265

226266
def convert_a(self, el, text, convert_as_inline):
227267
prefix, suffix, text = chomp(text)
@@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline):
246286
def convert_blockquote(self, el, text, convert_as_inline):
247287

248288
if convert_as_inline:
249-
return text
289+
return ' ' + text.strip() + ' '
250290

251291
return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
252292

@@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
280320
if style == UNDERLINED and n <= 2:
281321
line = '=' if n == 1 else '-'
282322
return self.underline(text, line)
323+
text = all_whitespace_re.sub(' ', text)
283324
hashes = '#' * n
284325
if style == ATX_CLOSED:
285-
return '%s %s %s\n\n' % (hashes, text, hashes)
286-
return '%s %s\n\n' % (hashes, text)
326+
return '\n%s %s %s\n\n' % (hashes, text, hashes)
327+
return '\n%s %s\n\n' % (hashes, text)
287328

288329
def convert_hr(self, el, text, convert_as_inline):
289330
return '\n\n---\n\n'
@@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline):
317358
el = el.parent
318359
if nested:
319360
# remove trailing newline if nested
320-
return '\n' + self.indent(text, 1).rstrip()
321-
return text + ('\n' if before_paragraph else '')
361+
return '\n' + text.rstrip()
362+
return '\n\n' + text + ('\n' if before_paragraph else '')
322363

323364
convert_ul = convert_list
324365
convert_ol = convert_list
@@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline):
339380
el = el.parent
340381
bullets = self.options['bullets']
341382
bullet = bullets[depth % len(bullets)]
342-
return '%s %s\n' % (bullet, (text or '').strip())
383+
bullet = bullet + ' '
384+
text = (text or '').strip()
385+
text = self.indent(text, len(bullet))
386+
if text:
387+
text = bullet + text[len(bullet):]
388+
return '%s\n' % text
343389

344390
def convert_p(self, el, text, convert_as_inline):
345391
if convert_as_inline:
346-
return text
392+
return ' ' + text.strip() + ' '
347393
if self.options['wrap']:
348-
text = fill(text,
349-
width=self.options['wrap_width'],
350-
break_long_words=False,
351-
break_on_hyphens=False)
352-
return '%s\n\n' % text if text else ''
394+
# Preserve newlines (and preceding whitespace) resulting
395+
# from <br> tags. Newlines in the input have already been
396+
# replaced by spaces.
397+
lines = text.split('\n')
398+
new_lines = []
399+
for line in lines:
400+
line = line.lstrip()
401+
line_no_trailing = line.rstrip()
402+
trailing = line[len(line_no_trailing):]
403+
line = fill(line,
404+
width=self.options['wrap_width'],
405+
break_long_words=False,
406+
break_on_hyphens=False)
407+
new_lines.append(line + trailing)
408+
text = '\n'.join(new_lines)
409+
return '\n\n%s\n\n' % text if text else ''
353410

354411
def convert_pre(self, el, text, convert_as_inline):
355412
if not text:

pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "markdownify"
7-
version = "0.13.1"
7+
version = "0.14.0"
88
authors = [{name = "Matthew Tretter", email = "[email protected]"}]
99
description = "Convert HTML to markdown."
1010
readme = "README.rst"

tests/test_advanced.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def test_chomp():
1414

1515
def test_nested():
1616
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
17-
assert text == 'This is an [example link](http://example.com/).\n\n'
17+
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'
1818

1919

2020
def test_ignore_comments():

tests/test_basic.py

+1
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ def test_soup():
1111

1212
def test_whitespace():
1313
assert md(' a b \t\t c ') == ' a b c '
14+
assert md(' a b \n\n c ') == ' a b\nc '

0 commit comments

Comments
 (0)