Skip to content

Commit be3a7f4

Browse files
committed
Merge branch 'develop'
2 parents 8219d2a + 3b4a014 commit be3a7f4

File tree

4 files changed

+170
-17
lines changed

4 files changed

+170
-17
lines changed

README.rst

+21-1
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,12 @@ Creating Custom Converters
156156

157157
If you have a special use case that calls for a special conversion, you can
158158
always inherit from ``MarkdownConverter`` and override the method you want to
159-
change:
159+
change.
160+
The function that handles an HTML tag named ``abc`` is called
161+
``convert_abc(self, el, text, convert_as_inline)`` and returns a string
162+
containing the converted HTML tag.
163+
The ``MarkdownConverter`` object will handle the conversion based on the
164+
function names:
160165

161166
.. code:: python
162167
@@ -173,6 +178,21 @@ change:
173178
def md(html, **options):
174179
return ImageBlockConverter(**options).convert(html)
175180
181+
.. code:: python
182+
183+
from markdownify import MarkdownConverter
184+
185+
class IgnoreParagraphsConverter(MarkdownConverter):
186+
"""
187+
Create a custom MarkdownConverter that ignores paragraphs
188+
"""
189+
def convert_p(self, el, text, convert_as_inline):
190+
return ''
191+
192+
# Create shorthand method for conversion
193+
def md(html, **options):
194+
return IgnoreParagraphsConverter(**options).convert(html)
195+
176196
177197
Command Line Interface
178198
======================

markdownify/__init__.py

+38-11
Original file line numberDiff line numberDiff line change
@@ -152,13 +152,12 @@ def is_nested_node(el):
152152
def process_text(self, el):
153153
text = six.text_type(el) or ''
154154

155-
# dont remove any whitespace when handling pre or code in pre
156-
if not (el.parent.name == 'pre'
157-
or (el.parent.name == 'code'
158-
and el.parent.parent.name == 'pre')):
155+
# normalize whitespace if we're not inside a preformatted element
156+
if not el.find_parent('pre'):
159157
text = whitespace_re.sub(' ', text)
160158

161-
if el.parent.name != 'code' and el.parent.name != 'pre':
159+
# escape special characters if we're not inside a preformatted or code element
160+
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
162161
text = self.escape(text)
163162

164163
# remove trailing whitespaces if any of the following condition is true:
@@ -238,7 +237,7 @@ def convert_blockquote(self, el, text, convert_as_inline):
238237
if convert_as_inline:
239238
return text
240239

241-
return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else ''
240+
return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
242241

243242
def convert_br(self, el, text, convert_as_inline):
244243
if convert_as_inline:
@@ -266,7 +265,7 @@ def convert_hn(self, n, el, text, convert_as_inline):
266265
return text
267266

268267
style = self.options['heading_style'].lower()
269-
text = text.rstrip()
268+
text = text.strip()
270269
if style == UNDERLINED and n <= 2:
271270
line = '=' if n == 1 else '-'
272271
return self.underline(text, line)
@@ -351,6 +350,12 @@ def convert_pre(self, el, text, convert_as_inline):
351350

352351
return '\n```%s\n%s\n```\n' % (code_language, text)
353352

353+
def convert_script(self, el, text, convert_as_inline):
354+
return ''
355+
356+
def convert_style(self, el, text, convert_as_inline):
357+
return ''
358+
354359
convert_s = convert_del
355360

356361
convert_strong = convert_b
@@ -364,20 +369,42 @@ def convert_pre(self, el, text, convert_as_inline):
364369
def convert_table(self, el, text, convert_as_inline):
365370
return '\n\n' + text + '\n'
366371

372+
def convert_caption(self, el, text, convert_as_inline):
373+
return text + '\n'
374+
375+
def convert_figcaption(self, el, text, convert_as_inline):
376+
return '\n\n' + text + '\n\n'
377+
367378
def convert_td(self, el, text, convert_as_inline):
368-
return ' ' + text + ' |'
379+
colspan = 1
380+
if 'colspan' in el.attrs:
381+
colspan = int(el['colspan'])
382+
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
369383

370384
def convert_th(self, el, text, convert_as_inline):
371-
return ' ' + text + ' |'
385+
colspan = 1
386+
if 'colspan' in el.attrs:
387+
colspan = int(el['colspan'])
388+
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
372389

373390
def convert_tr(self, el, text, convert_as_inline):
374391
cells = el.find_all(['td', 'th'])
375-
is_headrow = all([cell.name == 'th' for cell in cells])
392+
is_headrow = (
393+
all([cell.name == 'th' for cell in cells])
394+
or (not el.previous_sibling and not el.parent.name == 'tbody')
395+
or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
396+
)
376397
overline = ''
377398
underline = ''
378399
if is_headrow and not el.previous_sibling:
379400
# first row and is headline: print headline underline
380-
underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
401+
full_colspan = 0
402+
for cell in cells:
403+
if "colspan" in cell.attrs:
404+
full_colspan += int(cell["colspan"])
405+
else:
406+
full_colspan += 1
407+
underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
381408
elif (not el.previous_sibling
382409
and (el.parent.name == 'table'
383410
or (el.parent.name == 'tbody'

tests/test_conversions.py

+38-3
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,12 @@ def test_b_spaces():
5252

5353
def test_blockquote():
5454
assert md('<blockquote>Hello</blockquote>') == '\n> Hello\n\n'
55+
assert md('<blockquote>\nHello\n</blockquote>') == '\n> Hello\n\n'
56+
57+
58+
def test_blockquote_with_nested_paragraph():
59+
assert md('<blockquote><p>Hello</p></blockquote>') == '\n> Hello\n\n'
60+
assert md('<blockquote><p>Hello</p><p>Hello again</p></blockquote>') == '\n> Hello\n> \n> Hello again\n\n'
5561

5662

5763
def test_blockquote_with_paragraph():
@@ -60,17 +66,27 @@ def test_blockquote_with_paragraph():
6066

6167
def test_blockquote_nested():
6268
text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
63-
assert text == '\n> And she was like \n> > Hello\n> \n> \n\n'
69+
assert text == '\n> And she was like \n> > Hello\n\n'
6470

6571

6672
def test_br():
6773
assert md('a<br />b<br />c') == 'a \nb \nc'
6874
assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
6975

7076

77+
def test_caption():
78+
assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
79+
assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'
80+
81+
7182
def test_code():
7283
inline_tests('code', '`')
73-
assert md('<code>this_should_not_escape</code>') == '`this_should_not_escape`'
84+
assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
85+
assert md('<kbd>*this_should_not_escape*</kbd>') == '`*this_should_not_escape*`'
86+
assert md('<samp>*this_should_not_escape*</samp>') == '`*this_should_not_escape*`'
87+
assert md('<code><span>*this_should_not_escape*</span></code>') == '`*this_should_not_escape*`'
88+
assert md('<code>this should\t\tnormalize</code>') == '`this should normalize`'
89+
assert md('<code><span>this should\t\tnormalize</span></code>') == '`this should normalize`'
7490

7591

7692
def test_del():
@@ -85,6 +101,14 @@ def test_em():
85101
inline_tests('em', '*')
86102

87103

104+
def test_header_with_space():
105+
assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
106+
assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
107+
assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
108+
assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
109+
assert md('<h5>\n\nHello \n\n</h5>') == '##### Hello\n\n'
110+
111+
88112
def test_h1():
89113
assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
90114

@@ -187,7 +211,18 @@ def test_p():
187211
def test_pre():
188212
assert md('<pre>test\n foo\nbar</pre>') == '\n```\ntest\n foo\nbar\n```\n'
189213
assert md('<pre><code>test\n foo\nbar</code></pre>') == '\n```\ntest\n foo\nbar\n```\n'
190-
assert md('<pre>this_should_not_escape</pre>') == '\n```\nthis_should_not_escape\n```\n'
214+
assert md('<pre>*this_should_not_escape*</pre>') == '\n```\n*this_should_not_escape*\n```\n'
215+
assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n```\n*this_should_not_escape*\n```\n'
216+
assert md('<pre>\t\tthis should\t\tnot normalize</pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
217+
assert md('<pre><span>\t\tthis should\t\tnot normalize</span></pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
218+
219+
220+
def test_script():
221+
assert md('foo <script>var foo=42;</script> bar') == 'foo bar'
222+
223+
224+
def test_style():
225+
assert md('foo <style>h1 { font-size: larger }</style> bar') == 'foo bar'
191226

192227

193228
def test_s():

tests/test_tables.py

+73-2
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,26 @@
5757
</tr>
5858
</table>"""
5959

60+
table_with_linebreaks = """<table>
61+
<tr>
62+
<th>Firstname</th>
63+
<th>Lastname</th>
64+
<th>Age</th>
65+
</tr>
66+
<tr>
67+
<td>Jill</td>
68+
<td>Smith
69+
Jackson</td>
70+
<td>50</td>
71+
</tr>
72+
<tr>
73+
<td>Eve</td>
74+
<td>Jackson
75+
Smith</td>
76+
<td>94</td>
77+
</tr>
78+
</table>"""
79+
6080

6181
table_with_header_column = """<table>
6282
<tr>
@@ -99,6 +119,28 @@
99119
</tbody>
100120
</table>"""
101121

122+
table_head_body_missing_head = """<table>
123+
<thead>
124+
<tr>
125+
<td>Firstname</td>
126+
<td>Lastname</td>
127+
<td>Age</td>
128+
</tr>
129+
</thead>
130+
<tbody>
131+
<tr>
132+
<td>Jill</td>
133+
<td>Smith</td>
134+
<td>50</td>
135+
</tr>
136+
<tr>
137+
<td>Eve</td>
138+
<td>Jackson</td>
139+
<td>94</td>
140+
</tr>
141+
</tbody>
142+
</table>"""
143+
102144
table_missing_text = """<table>
103145
<thead>
104146
<tr>
@@ -159,13 +201,42 @@
159201
</tbody>
160202
</table>"""
161203

204+
table_with_caption = """TEXT<table><caption>Caption</caption>
205+
<tbody><tr><td>Firstname</td>
206+
<td>Lastname</td>
207+
<td>Age</td>
208+
</tr>
209+
</tbody>
210+
</table>"""
211+
212+
table_with_colspan = """<table>
213+
<tr>
214+
<th colspan="2">Name</th>
215+
<th>Age</th>
216+
</tr>
217+
<tr>
218+
<td>Jill</td>
219+
<td>Smith</td>
220+
<td>50</td>
221+
</tr>
222+
<tr>
223+
<td>Eve</td>
224+
<td>Jackson</td>
225+
<td>94</td>
226+
</tr>
227+
</table>"""
228+
162229

163230
def test_table():
164231
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
165232
assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
166233
assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
234+
assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
167235
assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
168236
assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
237+
assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
169238
assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n'
170-
assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
171-
assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
239+
assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
240+
assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
241+
assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
242+
assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'

0 commit comments

Comments
 (0)