7
7
convert_heading_re = re .compile (r'convert_h(\d+)' )
8
8
line_beginning_re = re .compile (r'^' , re .MULTILINE )
9
9
whitespace_re = re .compile (r'[\t ]+' )
10
- all_whitespace_re = re .compile (r'[\s]+' )
10
+ all_whitespace_re = re .compile (r'[\t \r\n]+' )
11
+ newline_whitespace_re = re .compile (r'[\t \r\n]*[\r\n][\t \r\n]*' )
11
12
html_heading_re = re .compile (r'h[1-6]' )
12
13
13
14
@@ -66,6 +67,23 @@ def _todict(obj):
66
67
return dict ((k , getattr (obj , k )) for k in dir (obj ) if not k .startswith ('_' ))
67
68
68
69
70
+ def should_remove_whitespace_inside (el ):
71
+ """Return whether to remove whitespace immediately inside a block-level element."""
72
+ if not el or not el .name :
73
+ return False
74
+ if html_heading_re .match (el .name ) is not None :
75
+ return True
76
+ return el .name in ('p' , 'blockquote' ,
77
+ 'ol' , 'ul' , 'li' ,
78
+ 'table' , 'thead' , 'tbody' , 'tfoot' ,
79
+ 'tr' , 'td' , 'th' )
80
+
81
+
82
+ def should_remove_whitespace_outside (el ):
83
+ """Return whether to remove whitespace immediately outside a block-level element."""
84
+ return should_remove_whitespace_inside (el ) or (el and el .name == 'pre' )
85
+
86
+
69
87
class MarkdownConverter (object ):
70
88
class DefaultOptions :
71
89
autolinks = True
@@ -76,7 +94,7 @@ class DefaultOptions:
76
94
default_title = False
77
95
escape_asterisks = True
78
96
escape_underscores = True
79
- escape_misc = True
97
+ escape_misc = False
80
98
heading_style = UNDERLINED
81
99
keep_inline_images_in = []
82
100
newline_style = SPACES
@@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
119
137
if not children_only and (isHeading or isCell ):
120
138
convert_children_as_inline = True
121
139
122
- # Remove whitespace-only textnodes in purely nested nodes
123
- def is_nested_node (el ):
124
- return el and el .name in ['ol' , 'ul' , 'li' ,
125
- 'table' , 'thead' , 'tbody' , 'tfoot' ,
126
- 'tr' , 'td' , 'th' ]
127
-
128
- if is_nested_node (node ):
129
- for el in node .children :
130
- # Only extract (remove) whitespace-only text node if any of the
131
- # conditions is true:
132
- # - el is the first element in its parent
133
- # - el is the last element in its parent
134
- # - el is adjacent to an nested node
135
- can_extract = (not el .previous_sibling
136
- or not el .next_sibling
137
- or is_nested_node (el .previous_sibling )
138
- or is_nested_node (el .next_sibling ))
139
- if (isinstance (el , NavigableString )
140
- and six .text_type (el ).strip () == ''
141
- and can_extract ):
142
- el .extract ()
140
+ # Remove whitespace-only text nodes just before, after or
141
+ # inside block-level elements.
142
+ should_remove_inside = should_remove_whitespace_inside (node )
143
+ for el in node .children :
144
+ # Only extract (remove) whitespace-only text node if any of the
145
+ # conditions is true:
146
+ # - el is the first element in its parent (block-level)
147
+ # - el is the last element in its parent (block-level)
148
+ # - el is adjacent to a block-level node
149
+ can_extract = (should_remove_inside and (not el .previous_sibling
150
+ or not el .next_sibling )
151
+ or should_remove_whitespace_outside (el .previous_sibling )
152
+ or should_remove_whitespace_outside (el .next_sibling ))
153
+ if (isinstance (el , NavigableString )
154
+ and six .text_type (el ).strip () == ''
155
+ and can_extract ):
156
+ el .extract ()
143
157
144
158
# Convert the children first
145
159
for el in node .children :
@@ -148,7 +162,13 @@ def is_nested_node(el):
148
162
elif isinstance (el , NavigableString ):
149
163
text += self .process_text (el )
150
164
else :
151
- text += self .process_tag (el , convert_children_as_inline )
165
+ text_strip = text .rstrip ('\n ' )
166
+ newlines_left = len (text ) - len (text_strip )
167
+ next_text = self .process_tag (el , convert_children_as_inline )
168
+ next_text_strip = next_text .lstrip ('\n ' )
169
+ newlines_right = len (next_text ) - len (next_text_strip )
170
+ newlines = '\n ' * max (newlines_left , newlines_right )
171
+ text = text_strip + newlines + next_text_strip
152
172
153
173
if not children_only :
154
174
convert_fn = getattr (self , 'convert_%s' % node .name , None )
@@ -162,18 +182,26 @@ def process_text(self, el):
162
182
163
183
# normalize whitespace if we're not inside a preformatted element
164
184
if not el .find_parent ('pre' ):
165
- text = whitespace_re .sub (' ' , text )
185
+ if self .options ['wrap' ]:
186
+ text = all_whitespace_re .sub (' ' , text )
187
+ else :
188
+ text = newline_whitespace_re .sub ('\n ' , text )
189
+ text = whitespace_re .sub (' ' , text )
166
190
167
191
# escape special characters if we're not inside a preformatted or code element
168
192
if not el .find_parent (['pre' , 'code' , 'kbd' , 'samp' ]):
169
193
text = self .escape (text )
170
194
171
- # remove trailing whitespaces if any of the following condition is true:
172
- # - current text node is the last node in li
173
- # - current text node is followed by an embedded list
174
- if (el .parent .name == 'li'
175
- and (not el .next_sibling
176
- or el .next_sibling .name in ['ul' , 'ol' ])):
195
+ # remove leading whitespace at the start or just after a
196
+ # block-level element; remove trailing whitespace at the end
197
+ # or just before a block-level element.
198
+ if (should_remove_whitespace_outside (el .previous_sibling )
199
+ or (should_remove_whitespace_inside (el .parent )
200
+ and not el .previous_sibling )):
201
+ text = text .lstrip ()
202
+ if (should_remove_whitespace_outside (el .next_sibling )
203
+ or (should_remove_whitespace_inside (el .parent )
204
+ and not el .next_sibling )):
177
205
text = text .rstrip ()
178
206
179
207
return text
@@ -208,20 +236,32 @@ def escape(self, text):
208
236
if not text :
209
237
return ''
210
238
if self .options ['escape_misc' ]:
211
- text = re .sub (r'([\\&<`[>~#=+|-])' , r'\\\1' , text )
212
- text = re .sub (r'([0-9])([.)])' , r'\1\\\2' , text )
239
+ text = re .sub (r'([\\&<`[>~=+|])' , r'\\\1' , text )
240
+ # A sequence of one or more consecutive '-', preceded and
241
+ # followed by whitespace or start/end of fragment, might
242
+ # be confused with an underline of a header, or with a
243
+ # list marker.
244
+ text = re .sub (r'(\s|^)(-+(?:\s|$))' , r'\1\\\2' , text )
245
+ # A sequence of up to six consecutive '#', preceded and
246
+ # followed by whitespace or start/end of fragment, might
247
+ # be confused with an ATX heading.
248
+ text = re .sub (r'(\s|^)(#{1,6}(?:\s|$))' , r'\1\\\2' , text )
249
+ # '.' or ')' preceded by up to nine digits might be
250
+ # confused with a list item.
251
+ text = re .sub (r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))' , r'\1\\\2' ,
252
+ text )
213
253
if self .options ['escape_asterisks' ]:
214
254
text = text .replace ('*' , r'\*' )
215
255
if self .options ['escape_underscores' ]:
216
256
text = text .replace ('_' , r'\_' )
217
257
return text
218
258
219
- def indent (self , text , level ):
220
- return line_beginning_re .sub ('\t ' * level , text ) if text else ''
259
+ def indent (self , text , columns ):
260
+ return line_beginning_re .sub (' ' * columns , text ) if text else ''
221
261
222
262
def underline (self , text , pad_char ):
223
263
text = (text or '' ).rstrip ()
224
- return '%s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
264
+ return '\n \n %s\n %s\n \n ' % (text , pad_char * len (text )) if text else ''
225
265
226
266
def convert_a (self , el , text , convert_as_inline ):
227
267
prefix , suffix , text = chomp (text )
@@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline):
246
286
def convert_blockquote (self , el , text , convert_as_inline ):
247
287
248
288
if convert_as_inline :
249
- return text
289
+ return ' ' + text . strip () + ' '
250
290
251
291
return '\n ' + (line_beginning_re .sub ('> ' , text .strip ()) + '\n \n ' ) if text else ''
252
292
@@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
280
320
if style == UNDERLINED and n <= 2 :
281
321
line = '=' if n == 1 else '-'
282
322
return self .underline (text , line )
323
+ text = all_whitespace_re .sub (' ' , text )
283
324
hashes = '#' * n
284
325
if style == ATX_CLOSED :
285
- return '%s %s %s\n \n ' % (hashes , text , hashes )
286
- return '%s %s\n \n ' % (hashes , text )
326
+ return '\n %s %s %s\n \n ' % (hashes , text , hashes )
327
+ return '\n %s %s\n \n ' % (hashes , text )
287
328
288
329
def convert_hr (self , el , text , convert_as_inline ):
289
330
return '\n \n ---\n \n '
@@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline):
317
358
el = el .parent
318
359
if nested :
319
360
# remove trailing newline if nested
320
- return '\n ' + self . indent ( text , 1 ) .rstrip ()
321
- return text + ('\n ' if before_paragraph else '' )
361
+ return '\n ' + text .rstrip ()
362
+ return ' \n \n ' + text + ('\n ' if before_paragraph else '' )
322
363
323
364
convert_ul = convert_list
324
365
convert_ol = convert_list
@@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline):
339
380
el = el .parent
340
381
bullets = self .options ['bullets' ]
341
382
bullet = bullets [depth % len (bullets )]
342
- return '%s %s\n ' % (bullet , (text or '' ).strip ())
383
+ bullet = bullet + ' '
384
+ text = (text or '' ).strip ()
385
+ text = self .indent (text , len (bullet ))
386
+ if text :
387
+ text = bullet + text [len (bullet ):]
388
+ return '%s\n ' % text
343
389
344
390
def convert_p (self , el , text , convert_as_inline ):
345
391
if convert_as_inline :
346
- return text
392
+ return ' ' + text . strip () + ' '
347
393
if self .options ['wrap' ]:
348
- text = fill (text ,
349
- width = self .options ['wrap_width' ],
350
- break_long_words = False ,
351
- break_on_hyphens = False )
352
- return '%s\n \n ' % text if text else ''
394
+ # Preserve newlines (and preceding whitespace) resulting
395
+ # from <br> tags. Newlines in the input have already been
396
+ # replaced by spaces.
397
+ lines = text .split ('\n ' )
398
+ new_lines = []
399
+ for line in lines :
400
+ line = line .lstrip ()
401
+ line_no_trailing = line .rstrip ()
402
+ trailing = line [len (line_no_trailing ):]
403
+ line = fill (line ,
404
+ width = self .options ['wrap_width' ],
405
+ break_long_words = False ,
406
+ break_on_hyphens = False )
407
+ new_lines .append (line + trailing )
408
+ text = '\n ' .join (new_lines )
409
+ return '\n \n %s\n \n ' % text if text else ''
353
410
354
411
def convert_pre (self , el , text , convert_as_inline ):
355
412
if not text :
0 commit comments