Merge branch 'develop'

AlexVonB · AlexVonB · commit e9cc01938a50 · 2020-08-09T21:20:44.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,7 @@
 *.pyc
 *.egg
+.eggs/
+*.egg-info/
 .DS_Store
 /.env
 /dist
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright 2012-2018 Matthew Tretter
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup, NavigableString
 import re
+import six
 
 
 convert_heading_re = re.compile(r'convert_h(\d+)')
@@ -22,6 +23,19 @@ def escape(text):
     return text.replace('_', r'\_')
 
 
+def chomp(text):
+    """
+    If the text in an inline tag like b, a, or em contains a leading or trailing
+    space, strip the string and return a space as suffix or prefix, if needed.
+    This function is used to prevent conversions like
+        <b> foo</b> => ** foo**
+    """
+    prefix = ' ' if text and text[0] == ' ' else ''
+    suffix = ' ' if text and text[-1] == ' ' else ''
+    text = text.strip()
+    return (prefix, suffix, text)
+
+
 def _todict(obj):
     return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
 
@@ -52,7 +66,7 @@ def convert(self, html):
         # want a full document. Therefore, we'll mark our fragment with an id,
         # create the document, and extract the element with the id.
         html = wrapped % html
-        soup = BeautifulSoup(html)
+        soup = BeautifulSoup(html, 'html.parser')
         return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True)
 
     def process_tag(self, node, children_only=False):
@@ -61,7 +75,7 @@ def process_tag(self, node, children_only=False):
         # Convert the children first
         for el in node.children:
             if isinstance(el, NavigableString):
-                text += self.process_text(unicode(el))
+                text += self.process_text(six.text_type(el))
             else:
                 text += self.process_tag(el)
 
@@ -109,13 +123,16 @@ def underline(self, text, pad_char):
         return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
 
     def convert_a(self, el, text):
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
         href = el.get('href')
         title = el.get('title')
         if self.options['autolinks'] and text == href and not title:
             # Shortcut syntax
             return '<%s>' % href
         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        return '[%s](%s%s)' % (text or '', href, title_part) if href else text or ''
+        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
 
     def convert_b(self, el, text):
         return self.convert_strong(el, text)
@@ -127,7 +144,10 @@ def convert_br(self, el, text):
         return '  \n'
 
     def convert_em(self, el, text):
-        return '*%s*' % text if text else ''
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+        return '%s*%s*%s' % (prefix, text, suffix)
 
     def convert_hn(self, n, el, text):
         style = self.options['heading_style']
@@ -151,8 +171,9 @@ def convert_list(self, el, text):
                 break
             el = el.parent
         if nested:
-            text = '\n' + self.indent(text, 1)
-        return text
+            # remove trailing newline if nested
+            return '\n' + self.indent(text, 1).rstrip()
+        return '\n' + text + '\n'
 
     convert_ul = convert_list
     convert_ol = convert_list
@@ -175,7 +196,10 @@ def convert_p(self, el, text):
         return '%s\n\n' % text if text else ''
 
     def convert_strong(self, el, text):
-        return '**%s**' % text if text else ''
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+        return '%s**%s**%s' % (prefix, text, suffix)
 
     def convert_img(self, el, text):
         alt = el.attrs.get('alt', None) or ''
diff --git a/markdownify/pkgmeta.py b/markdownify/pkgmeta.py
diff --git a/setup.py b/setup.py
@@ -7,10 +7,11 @@
 
 read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
 
-
-pkgmeta = {}
-execfile(os.path.join(os.path.dirname(__file__), 'markdownify', 'pkgmeta.py'),
-         pkgmeta)
+pkgmeta = {
+    '__title__': 'markdownify',
+    '__author__': 'Matthew Tretter',
+    '__version__': '0.4.1',
+}
 
 
 class PyTest(TestCommand):
@@ -75,13 +76,13 @@ def run(self):
         'pytest',
     ],
     install_requires=[
-        'beautifulsoup4',
+        'beautifulsoup4', 'six'
     ],
     classifiers=[
         'Environment :: Web Environment',
         'Framework :: Django',
         'Intended Audience :: Developers',
-        'License :: OSI Approved :: BSD License',
+        'License :: OSI Approved :: MIT License',
         'Operating System :: OS Independent',
         'Programming Language :: Python :: 2.5',
         'Programming Language :: Python :: 2.6',
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -2,7 +2,7 @@
 import re
 
 
-nested_uls = re.sub('\s+', '', """
+nested_uls = re.sub(r'\s+', '', """
     <ul>
         <li>1
             <ul>
@@ -22,10 +22,28 @@
     </ul>""")
 
 
+def test_chomp():
+    assert md(' <b></b> ') == '  '
+    assert md(' <b> </b> ') == '  '
+    assert md(' <b>  </b> ') == '  '
+    assert md(' <b>   </b> ') == '  '
+    assert md(' <b>s </b> ') == ' **s**  '
+    assert md(' <b> s</b> ') == '  **s** '
+    assert md(' <b> s </b> ') == '  **s**  '
+    assert md(' <b>  s  </b> ') == '  **s**  '
+
+
 def test_a():
     assert md('<a href="http://google.com">Google</a>') == '[Google](http://google.com)'
 
 
+def test_a_spaces():
+    assert md('foo <a href="http://google.com">Google</a> bar') == 'foo [Google](http://google.com) bar'
+    assert md('foo<a href="http://google.com"> Google</a> bar') == 'foo [Google](http://google.com) bar'
+    assert md('foo <a href="http://google.com">Google </a>bar') == 'foo [Google](http://google.com) bar'
+    assert md('foo <a href="http://google.com"></a> bar') == 'foo  bar'
+
+
 def test_a_with_title():
     text = md('<a href="http://google.com" title="The &quot;Goog&quot;">Google</a>')
     assert text == r'[Google](http://google.com "The \"Goog\"")'
@@ -45,6 +63,13 @@ def test_b():
     assert md('<b>Hello</b>') == '**Hello**'
 
 
+def test_b_spaces():
+    assert md('foo <b>Hello</b> bar') == 'foo **Hello** bar'
+    assert md('foo<b> Hello</b> bar') == 'foo **Hello** bar'
+    assert md('foo <b>Hello </b>bar') == 'foo **Hello** bar'
+    assert md('foo <b></b> bar') == 'foo  bar'
+
+
 def test_blockquote():
     assert md('<blockquote>Hello</blockquote>').strip() == '> Hello'
 
@@ -62,6 +87,13 @@ def test_em():
     assert md('<em>Hello</em>') == '*Hello*'
 
 
+def test_em_spaces():
+    assert md('foo <em>Hello</em> bar') == 'foo *Hello* bar'
+    assert md('foo<em> Hello</em> bar') == 'foo *Hello* bar'
+    assert md('foo <em>Hello </em>bar') == 'foo *Hello* bar'
+    assert md('foo <em></em> bar') == 'foo  bar'
+
+
 def test_h1():
     assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
 
@@ -90,7 +122,7 @@ def test_i():
 
 
 def test_ol():
-    assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
+    assert md('<ol><li>a</li><li>b</li></ol>') == '\n1. a\n2. b\n\n'
 
 
 def test_p():
@@ -102,19 +134,23 @@ def test_strong():
 
 
 def test_ul():
-    assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
+    assert md('<ul><li>a</li><li>b</li></ul>') == '\n* a\n* b\n\n'
+
+
+def test_inline_ul():
+    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n\n* a\n* b\n\nbar\n\n'
 
 
 def test_nested_uls():
     """
     Nested ULs should alternate bullet characters.
 
     """
-    assert md(nested_uls) == '* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t\t\n\t+ b\n\t+ c\n\t\n* 2\n* 3\n'
+    assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n\n'
 
 
 def test_bullets():
-    assert md(nested_uls, bullets='-') == '- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t\t\n\t- b\n\t- c\n\t\n- 2\n- 3\n'
+    assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n\n'
 
 
 def test_img():

-Original file line number
+Diff line change
@@ -1,5 +1,7 @@
 *.pyc
 *.egg
+.eggs/
+*.egg-info/
 .DS_Store
 /.env
 /dist