From 94b1a20854cee30f2d4b282de1ca68c3ba591c73 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 8 Apr 2015 19:14:47 +0300
Subject: [PATCH 01/14] Add python3 support

---
 goose/__init__.py               |  9 ++++++---
 goose/cleaners.py               |  2 ++
 goose/configuration.py          |  9 +++++++--
 goose/extractors/content.py     |  2 +-
 goose/extractors/images.py      |  6 +++++-
 goose/extractors/metas.py       |  6 ++++--
 goose/image.py                  |  4 ++++
 goose/network.py                | 17 +++++++++--------
 goose/outputformatters.py       |  6 +++++-
 goose/parsers.py                |  7 +++++--
 goose/text.py                   | 13 ++++++++-----
 goose/utils/__init__.py         | 13 ++++++++++---
 goose/utils/encoding.py         | 28 +++++++++++++++-------------
 goose/utils/images.py           | 10 +++++++---
 setup.py                        |  2 +-
 tests/extractors/authors.py     | 20 +++++++++++++++++---
 tests/extractors/base.py        | 11 +++++++++--
 tests/extractors/content.py     |  4 +++-
 tests/extractors/images.py      |  6 ++++--
 tests/extractors/links.py       |  4 +++-
 tests/extractors/metas.py       |  4 +++-
 tests/extractors/opengraph.py   |  4 +++-
 tests/extractors/publishdate.py |  4 +++-
 tests/extractors/tags.py        |  4 +++-
 tests/extractors/title.py       |  4 +++-
 tests/extractors/tweets.py      |  3 ++-
 tests/extractors/videos.py      |  4 +++-
 tests/parsers.py                |  3 +++
 28 files changed, 148 insertions(+), 61 deletions(-)

diff --git a/goose/__init__.py b/goose/__init__.py
index 409b5732..f267fa34 100644
--- a/goose/__init__.py
+++ b/goose/__init__.py
@@ -64,9 +64,12 @@ def crawl(self, crawl_candiate):
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
-        except (UnicodeDecodeError, ValueError):
-            self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+        except (UnicodeDecodeError, ValueError) as e:
+            if parsers:
+                self.config.parser_class = parsers[0]
+                return self.crawl(crawl_candiate)
+            else:
+                raise e
         return article
 
     def initialize(self):
diff --git a/goose/cleaners.py b/goose/cleaners.py
index c1384ee0..9ab45b6d 100644
--- a/goose/cleaners.py
+++ b/goose/cleaners.py
@@ -20,6 +20,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+from __future__ import unicode_literals
+
 from goose.utils import ReplaceSequence
 
 
diff --git a/goose/configuration.py b/goose/configuration.py
index fcfa5b9a..4913f699 100644
--- a/goose/configuration.py
+++ b/goose/configuration.py
@@ -22,6 +22,9 @@
 """
 import os
 import tempfile
+
+import six
+
 from goose.text import StopWords
 from goose.parsers import Parser
 from goose.parsers import ParserSoup
@@ -30,10 +33,12 @@
 HTTP_DEFAULT_TIMEOUT = 30
 
 AVAILABLE_PARSERS = {
-    'lxml': Parser,
-    'soup': ParserSoup,
+    'lxml': Parser
 }
 
+if six.PY2:
+    AVAILABLE_PARSERS['soup'] = ParserSoup
+
 
 class Configuration(object):
 
diff --git a/goose/extractors/content.py b/goose/extractors/content.py
index e0703d55..afdc2c91 100644
--- a/goose/extractors/content.py
+++ b/goose/extractors/content.py
@@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
         if score_string:
             current_score = int(score_string)
 
-        new_score = current_score + addToScore
+        new_score = current_score + int(addToScore)
         self.parser.setAttribute(node, "gravityScore", str(new_score))
 
     def update_node_count(self, node, add_to_count):
diff --git a/goose/extractors/images.py b/goose/extractors/images.py
index 3af44f5f..2dd63786 100644
--- a/goose/extractors/images.py
+++ b/goose/extractors/images.py
@@ -23,7 +23,11 @@
 import re
 import os
 
-from urlparse import urlparse, urljoin
+try:
+    from urlparse import urlparse, urljoin
+except ImportError:
+    from urllib.parse import urlparse, urljoin
+
 
 from goose.extractors import BaseExtractor
 from goose.image import Image
diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py
index 95acadd5..ee7d520e 100644
--- a/goose/extractors/metas.py
+++ b/goose/extractors/metas.py
@@ -22,8 +22,10 @@
 """
 
 import re
-from urlparse import urljoin
-from urlparse import urlparse
+try:
+    from urlparse import urlparse, urljoin
+except ImportError:
+    from urllib.parse import urlparse, urljoin
 
 from goose.extractors import BaseExtractor
 
diff --git a/goose/image.py b/goose/image.py
index 351e3396..23026398 100644
--- a/goose/image.py
+++ b/goose/image.py
@@ -20,6 +20,10 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+try:
+    long
+except NameError:
+    long = int
 
 
 class Image(object):
diff --git a/goose/network.py b/goose/network.py
index 666a7d61..2b8265ad 100644
--- a/goose/network.py
+++ b/goose/network.py
@@ -20,7 +20,12 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import urllib2
+import six
+
+try:
+    from urllib2 import urlopen, Request
+except ImportError:
+    from urllib.request import urlopen, Request
 
 
 class HtmlFetcher(object):
@@ -39,18 +44,14 @@ def get_url(self):
 
     def get_html(self, url):
         # utf-8 encode unicode url
-        if isinstance(url, unicode):
+        if isinstance(url, six.text_type) and six.PY2:
             url = url.encode('utf-8')
 
         # set request
-        self.request = urllib2.Request(
-                        url,
-                        headers=self.headers)
+        self.request = Request(url, headers=self.headers)
         # do request
         try:
-            self.result = urllib2.urlopen(
-                            self.request,
-                            timeout=self.config.http_timeout)
+            self.result = urlopen(self.request, timeout=self.config.http_timeout)
         except Exception:
             self.result = None
 
diff --git a/goose/outputformatters.py b/goose/outputformatters.py
index 1f8ba4bd..21dab451 100644
--- a/goose/outputformatters.py
+++ b/goose/outputformatters.py
@@ -20,7 +20,11 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from HTMLParser import HTMLParser
+try:
+    from HTMLParser import HTMLParser
+except ImportError:
+    from html.parser import HTMLParser
+
 from goose.text import innerTrim
 
 
diff --git a/goose/parsers.py b/goose/parsers.py
index a43e9b47..c0b091a9 100644
--- a/goose/parsers.py
+++ b/goose/parsers.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 import lxml.html
-from lxml.html import soupparser
+
+import six
+
 from lxml import etree
 from copy import deepcopy
 from goose.text import innerTrim
@@ -56,7 +58,7 @@ def fromstring(self, html):
 
     @classmethod
     def nodeToString(self, node):
-        return etree.tostring(node)
+        return etree.tostring(node, encoding=six.text_type)
 
     @classmethod
     def replaceTag(self, node, tag):
@@ -239,6 +241,7 @@ class ParserSoup(Parser):
 
     @classmethod
     def fromstring(self, html):
+        from lxml.html import soupparser
         html = encodeValue(html)
         self.doc = soupparser.fromstring(html)
         return self.doc
diff --git a/goose/text.py b/goose/text.py
index 3ef63d6b..02846e20 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -23,6 +23,9 @@
 import os
 import re
 import string
+
+import six
+
 from goose.utils import FileHelper
 from goose.utils.encoding import smart_unicode
 from goose.utils.encoding import smart_str
@@ -32,7 +35,7 @@
 
 
 def innerTrim(value):
-    if isinstance(value, (unicode, str)):
+    if isinstance(value, (six.text_type, six.string_types)):
         # remove tab and white space
         value = re.sub(TABSSPACE, ' ', value)
         value = ''.join(value.splitlines())
@@ -87,7 +90,6 @@ def set_word_count(self, cnt):
 class StopWords(object):
 
     PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
-    TRANS_TABLE = string.maketrans('', '')
     _cached_stop_words = {}
 
     def __init__(self, language='en'):
@@ -106,9 +108,10 @@ def __init__(self, language='en'):
     def remove_punctuation(self, content):
         # code taken form
         # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
-        if isinstance(content, unicode):
-            content = content.encode('utf-8')
-        return content.translate(self.TRANS_TABLE, string.punctuation)
+        if not isinstance(content, six.text_type):
+            content = content.decode('utf-8')
+        tbl = dict.fromkeys(ord(x) for x in string.punctuation)
+        return content.translate(tbl)
 
     def candiate_words(self, stripped_input):
         return stripped_input.split(' ')
diff --git a/goose/utils/__init__.py b/goose/utils/__init__.py
index 5a1de7d4..41cf9c95 100644
--- a/goose/utils/__init__.py
+++ b/goose/utils/__init__.py
@@ -26,7 +26,13 @@
 import os
 import goose
 import codecs
-import urlparse
+
+import six
+
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse
 
 
 class BuildURL(object):
@@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash):
 class RawHelper(object):
     @classmethod
     def get_parsing_candidate(self, url, raw_html):
-        if isinstance(raw_html, unicode):
+        if isinstance(raw_html, six.text_type):
             raw_html = raw_html.encode('utf-8')
         link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
         return ParsingCandidate(url, link_hash)
@@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl):
         # replace shebang is urls
         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
                     if '#!' in url_to_crawl else url_to_crawl
-        link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
+        url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url
+        link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time())
         return ParsingCandidate(final_url, link_hash)
 
 
diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py
index 4dc23ca7..eb98917c 100644
--- a/goose/utils/encoding.py
+++ b/goose/utils/encoding.py
@@ -1,6 +1,8 @@
 # -*- coding: utf-8 -*-
-import types
 import datetime
+
+import six
+
 from decimal import Decimal
 
 
@@ -45,8 +47,8 @@ def is_protected_type(obj):
     force_unicode(strings_only=True).
     """
     return isinstance(obj, (
-        types.NoneType,
-        int, long,
+        type(None),
+        six.integer_types,
         datetime.datetime, datetime.date, datetime.time,
         float, Decimal)
     )
@@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
     # Handle the common case first, saves 30-40% in performance when s
     # is an instance of unicode. This function gets called often in that
     # setting.
-    if isinstance(s, unicode):
+    if isinstance(s, six.text_type):
         return s
     if strings_only and is_protected_type(s):
         return s
     try:
-        if not isinstance(s, basestring,):
+        if not isinstance(s, six.string_types,):
             if hasattr(s, '__unicode__'):
-                s = unicode(s)
+                s = s.__unicode__()
             else:
                 try:
-                    s = unicode(str(s), encoding, errors)
+                    s = six.text_type(s, encoding, errors)
                 except UnicodeEncodeError:
                     if not isinstance(s, Exception):
                         raise
@@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'):
                     # output should be.
                     s = u' '.join([force_unicode(arg, encoding, strings_only,
                             errors) for arg in s])
-        elif not isinstance(s, unicode):
+        elif not isinstance(s, six.text_type):
             # Note: We use .decode() here, instead of unicode(s, encoding,
             # errors), so that if s is a SafeString, it ends up being a
             # SafeUnicode at the end.
             s = s.decode(encoding, errors)
-    except UnicodeDecodeError, e:
+    except UnicodeDecodeError as e:
         if not isinstance(s, Exception):
             raise DjangoUnicodeDecodeError(s, *e.args)
         else:
@@ -109,11 +111,11 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
 
     If strings_only is True, don't convert (some) non-string-like objects.
     """
-    if strings_only and isinstance(s, (types.NoneType, int)):
+    if strings_only and isinstance(s, (type(None), int)):
         return s
     # if isinstance(s, Promise):
     #     return unicode(s).encode(encoding, errors)
-    if not isinstance(s, basestring):
+    if not isinstance(s, six.string_types):
         try:
             return str(s)
         except UnicodeEncodeError:
@@ -123,8 +125,8 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
                 # further exception.
                 return ' '.join([smart_str(arg, encoding, strings_only,
                         errors) for arg in s])
-            return unicode(s).encode(encoding, errors)
-    elif isinstance(s, unicode):
+            return six.text_type(s).encode(encoding, errors)
+    elif isinstance(s, six.text_type):
         return s.encode(encoding, errors)
     elif s and encoding != 'utf-8':
         return s.decode('utf-8', errors).encode(encoding, errors)
diff --git a/goose/utils/images.py b/goose/utils/images.py
index 388d5c85..76a8c72f 100644
--- a/goose/utils/images.py
+++ b/goose/utils/images.py
@@ -22,8 +22,12 @@
 """
 import hashlib
 import os
-import urllib2
+try:
+    from urllib2 import urlopen, Request
+except ImportError:
+    from urllib.request import urlopen, Request
 from PIL import Image
+
 from goose.utils.encoding import smart_str
 from goose.image import ImageDetails
 from goose.image import LocallyStoredImage
@@ -115,8 +119,8 @@ def clean_src_string(self, src):
     @classmethod
     def fetch(self, http_client, src):
         try:
-            req = urllib2.Request(src)
-            f = urllib2.urlopen(req)
+            req = Request(src)
+            f = urlopen(req)
             data = f.read()
             return data
         except Exception:
diff --git a/setup.py b/setup.py
index ebad2547..fbe60081 100644
--- a/setup.py
+++ b/setup.py
@@ -66,6 +66,6 @@
     packages=find_packages(),
     include_package_data=True,
     zip_safe=False,
-    install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'],
+    install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'],
     test_suite="tests"
 )
diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py
index 709040c1..a21d362e 100644
--- a/tests/extractors/authors.py
+++ b/tests/extractors/authors.py
@@ -21,12 +21,26 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestArticleAuthor(TestExtractionBase):
 
     def test_author_schema(self):
         article = self.getArticle()
-        fields = ['authors']
-        self.runArticleAssertions(article=article, fields=fields)
+        field = 'authors'
+
+        # Do not call self.runArticleAssertions here: the results come from a set,
+        # which does not preserve ordering, so both lists are sorted before comparing.
+
+        expected_value = self.data['expected'][field]
+        result_value = getattr(article, field, None)
+
+        expected_value.sort()
+        result_value.sort()
+
+        # default assertion
+        msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value)
+        self.assertEqual(expected_value, result_value, msg=msg)
diff --git a/tests/extractors/base.py b/tests/extractors/base.py
index e19d20e0..72d4c601 100644
--- a/tests/extractors/base.py
+++ b/tests/extractors/base.py
@@ -22,11 +22,18 @@
 """
 import os
 import json
-import urllib2
 import unittest
 import socket
 
-from StringIO import StringIO
+try:
+    import urllib2
+except ImportError:
+    import urllib.request as urllib2
+
+try:
+    from StringIO import StringIO
+except ImportError:
+    from io import StringIO
 
 from goose import Goose
 from goose.utils import FileHelper
diff --git a/tests/extractors/content.py b/tests/extractors/content.py
index 30dc2754..854c4bd1 100644
--- a/tests/extractors/content.py
+++ b/tests/extractors/content.py
@@ -20,7 +20,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 from goose.text import StopWordsChinese
 from goose.text import StopWordsArabic
diff --git a/tests/extractors/images.py b/tests/extractors/images.py
index e47a1dde..896d6985 100644
--- a/tests/extractors/images.py
+++ b/tests/extractors/images.py
@@ -20,13 +20,15 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+from __future__ import absolute_import
+
 import os
 import json
 import hashlib
 import unittest
 
-from base import MockResponse
-from base import TestExtractionBase
+from .base import MockResponse
+from .base import TestExtractionBase
 
 from goose.configuration import Configuration
 from goose.image import Image
diff --git a/tests/extractors/links.py b/tests/extractors/links.py
index 8539465e..ea15a459 100644
--- a/tests/extractors/links.py
+++ b/tests/extractors/links.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestArticleLinks(TestExtractionBase):
diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py
index fd45915a..a4eef74c 100644
--- a/tests/extractors/metas.py
+++ b/tests/extractors/metas.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestMetas(TestExtractionBase):
diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py
index 415a784c..a0616227 100644
--- a/tests/extractors/opengraph.py
+++ b/tests/extractors/opengraph.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestOpenGraph(TestExtractionBase):
diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py
index 8d2a13b9..355250d5 100644
--- a/tests/extractors/publishdate.py
+++ b/tests/extractors/publishdate.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestPublishDate(TestExtractionBase):
diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py
index 22b17129..2f5562ba 100644
--- a/tests/extractors/tags.py
+++ b/tests/extractors/tags.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestArticleTags(TestExtractionBase):
diff --git a/tests/extractors/title.py b/tests/extractors/title.py
index 09170205..c6f7813c 100644
--- a/tests/extractors/title.py
+++ b/tests/extractors/title.py
@@ -21,7 +21,9 @@
 limitations under the License.
 """
 
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class TestTitle(TestExtractionBase):
diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py
index 50300f43..3f72a604 100644
--- a/tests/extractors/tweets.py
+++ b/tests/extractors/tweets.py
@@ -20,8 +20,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+from __future__ import absolute_import
 
-from base import TestExtractionBase
+from .base import TestExtractionBase
 
 
 class TestArticleTweet(TestExtractionBase):
diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py
index 10be15ff..0350c8c3 100644
--- a/tests/extractors/videos.py
+++ b/tests/extractors/videos.py
@@ -20,7 +20,9 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from base import TestExtractionBase
+from __future__ import absolute_import
+
+from .base import TestExtractionBase
 
 
 class ImageExtractionTests(TestExtractionBase):
diff --git a/tests/parsers.py b/tests/parsers.py
index 6614368d..7b47d89e 100644
--- a/tests/parsers.py
+++ b/tests/parsers.py
@@ -22,6 +22,7 @@
 """
 import os
 import unittest
+import sys
 
 from goose.utils import FileHelper
 from goose.parsers import Parser
@@ -260,5 +261,7 @@ class TestParser(ParserBase):
 
 
 class TestParserSoup(ParserBase):
+
+    @unittest.skipIf(sys.version_info.major != 2, "supported only in python2")
     def setUp(self):
         self.parser = ParserSoup

From 6d9156595ffa093febad6e137f877add162e269a Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 8 Apr 2015 19:19:53 +0300
Subject: [PATCH 02/14] Update requirements

---
 requirements.txt | 3 ++-
 setup.py         | 9 ++++++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7e6a6c09..bbd377ee 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,5 +2,6 @@ Pillow
 lxml
 cssselect
 jieba
-beautifulsoup
+beautifulsoup  # Only on python2
 nltk
+six
diff --git a/setup.py b/setup.py
index fbe60081..df367682 100644
--- a/setup.py
+++ b/setup.py
@@ -22,6 +22,8 @@
 """
 
 import os
+import sys
+
 from setuptools import setup, find_packages
 from imp import load_source
 
@@ -53,6 +55,11 @@
 except Exception:
     long_description = description
 
+requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six']
+if sys.version_info.major == 2:
+    requirements.append('beautifulsoup')
+
+
 setup(name='goose-extractor',
     version=version.__version__,
     description=description,
@@ -66,6 +73,6 @@
     packages=find_packages(),
     include_package_data=True,
     zip_safe=False,
-    install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'],
+    install_requires=requirements,
     test_suite="tests"
 )

From 79a12dd349efdb77c2762e231b8dc6e5c4166a2b Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 9 Apr 2015 11:58:29 +0300
Subject: [PATCH 03/14] Add python3 to CLASSIFIERS

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index df367682..ab03c825 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@
     'Programming Language :: Python :: 2',
     'Programming Language :: Python :: 2.6',
     'Programming Language :: Python :: 2.7',
+    'Programming Language :: Python :: 3',
     'Topic :: Internet',
     'Topic :: Utilities',
     'Topic :: Software Development :: Libraries :: Python Modules']

From 76af358635237859917fec2e8f81cb17c6004934 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 9 Apr 2015 12:21:19 +0300
Subject: [PATCH 04/14] Optimize imports

---
 goose/__init__.py          | 1 -
 goose/extractors/images.py | 6 +-----
 goose/extractors/metas.py  | 6 ++----
 goose/image.py             | 8 ++------
 goose/outputformatters.py  | 5 +----
 goose/utils/images.py      | 7 +++----
 goose/video.py             | 1 +
 tests/extractors/base.py   | 5 +----
 8 files changed, 11 insertions(+), 28 deletions(-)

diff --git a/goose/__init__.py b/goose/__init__.py
index f267fa34..d1cd6da8 100644
--- a/goose/__init__.py
+++ b/goose/__init__.py
@@ -21,7 +21,6 @@
 limitations under the License.
 """
 import os
-import platform
 from tempfile import mkstemp
 
 from goose.version import version_info, __version__
diff --git a/goose/extractors/images.py b/goose/extractors/images.py
index 2dd63786..f258aead 100644
--- a/goose/extractors/images.py
+++ b/goose/extractors/images.py
@@ -23,11 +23,7 @@
 import re
 import os
 
-try:
-    from urlparse import urlparse, urljoin
-except ImportError:
-    from urllib.parse import urlparse, urljoin
-
+from six.moves.urllib.parse import urlparse, urljoin
 
 from goose.extractors import BaseExtractor
 from goose.image import Image
diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py
index ee7d520e..5a65aa16 100644
--- a/goose/extractors/metas.py
+++ b/goose/extractors/metas.py
@@ -22,10 +22,8 @@
 """
 
 import re
-try:
-    from urlparse import urlparse, urljoin
-except ImportError:
-    from urllib.parse import urlparse, urljoin
+
+from six.moves.urllib.parse import urlparse, urljoin
 
 from goose.extractors import BaseExtractor
 
diff --git a/goose/image.py b/goose/image.py
index 23026398..58ddd021 100644
--- a/goose/image.py
+++ b/goose/image.py
@@ -20,10 +20,6 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-try:
-    long
-except NameError:
-    long = int
 
 
 class Image(object):
@@ -50,7 +46,7 @@ def __init__(self):
         self.extraction_type = "NA"
 
         # stores how many bytes this image is.
-        self.bytes = long(0)
+        self.bytes = 0
 
     def get_src(self):
         return self.src
@@ -91,7 +87,7 @@ def set_mime_type(self, mime_type):
 class LocallyStoredImage(object):
 
     def __init__(self, src='', local_filename='',
-        link_hash='', bytes=long(0), file_extension='', height=0, width=0):
+                 link_hash='', bytes=0, file_extension='', height=0, width=0):
         self.src = src
         self.local_filename = local_filename
         self.link_hash = link_hash
diff --git a/goose/outputformatters.py b/goose/outputformatters.py
index 21dab451..808f2eee 100644
--- a/goose/outputformatters.py
+++ b/goose/outputformatters.py
@@ -20,10 +20,7 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-try:
-    from HTMLParser import HTMLParser
-except ImportError:
-    from html.parser import HTMLParser
+from six.moves.html_parser import HTMLParser
 
 from goose.text import innerTrim
 
diff --git a/goose/utils/images.py b/goose/utils/images.py
index 76a8c72f..92d5a133 100644
--- a/goose/utils/images.py
+++ b/goose/utils/images.py
@@ -22,10 +22,9 @@
 """
 import hashlib
 import os
-try:
-    from urllib2 import urlopen, Request
-except ImportError:
-    from urllib.request import urlopen, Request
+
+from six.moves.urllib.request import urlopen, Request
+
 from PIL import Image
 
 from goose.utils.encoding import smart_str
diff --git a/goose/video.py b/goose/video.py
index 8509bba0..0691ac96 100644
--- a/goose/video.py
+++ b/goose/video.py
@@ -21,6 +21,7 @@
 limitations under the License.
 """
 
+
 class Video(object):
     """\
     Video object
diff --git a/tests/extractors/base.py b/tests/extractors/base.py
index 72d4c601..a0849e35 100644
--- a/tests/extractors/base.py
+++ b/tests/extractors/base.py
@@ -30,10 +30,7 @@
 except ImportError:
     import urllib.request as urllib2
 
-try:
-    from StringIO import StringIO
-except ImportError:
-    from io import StringIO
+from six import StringIO
 
 from goose import Goose
 from goose.utils import FileHelper

From f44c2af9e6eee9ac21f45612f1a6b76ee3682cd0 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 9 Apr 2015 12:37:33 +0300
Subject: [PATCH 05/14] Restore python 2.6 support

---
 setup.py         | 2 +-
 tests/parsers.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index ab03c825..c046ed82 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
     long_description = description
 
 requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six']
-if sys.version_info.major == 2:
+if sys.version_info[0] == 2:
     requirements.append('beautifulsoup')
 
 
diff --git a/tests/parsers.py b/tests/parsers.py
index 7b47d89e..812cc6b2 100644
--- a/tests/parsers.py
+++ b/tests/parsers.py
@@ -22,7 +22,7 @@
 """
 import os
 import unittest
-import sys
+import six
 
 from goose.utils import FileHelper
 from goose.parsers import Parser
@@ -262,6 +262,6 @@ class TestParser(ParserBase):
 
 class TestParserSoup(ParserBase):
 
-    @unittest.skipIf(sys.version_info.major != 2, "supported only in python2")
+    @unittest.skipIf(six.PY3, "supported only in python2")
     def setUp(self):
         self.parser = ParserSoup

From 2e18083b9f52a903e53e5bbe30b55d28b37d181b Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 9 Apr 2015 13:06:08 +0300
Subject: [PATCH 06/14] Try to fix tests in python 2.6

---
 setup.py         | 2 ++
 tests/parsers.py | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c046ed82..bce19c5c 100644
--- a/setup.py
+++ b/setup.py
@@ -59,6 +59,8 @@
 requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six']
 if sys.version_info[0] == 2:
     requirements.append('beautifulsoup')
+    if sys.version_info[1] < 7:
+        requirements.append('unittest2')
 
 
 setup(name='goose-extractor',
diff --git a/tests/parsers.py b/tests/parsers.py
index 812cc6b2..41aa5934 100644
--- a/tests/parsers.py
+++ b/tests/parsers.py
@@ -21,7 +21,11 @@
 limitations under the License.
 """
 import os
-import unittest
+try:
+    import unittest2 as unittest  # Need to support skipIf in python 2.6
+except ImportError:
+    import unittest
+
 import six
 
 from goose.utils import FileHelper

From b7884f1c3e219dd2c4fa473a10f0aba80618e628 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 17:09:05 +0300
Subject: [PATCH 07/14] Fix smart_str

---
 goose/utils/encoding.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py
index eb98917c..25022704 100644
--- a/goose/utils/encoding.py
+++ b/goose/utils/encoding.py
@@ -115,9 +115,13 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
         return s
     # if isinstance(s, Promise):
     #     return unicode(s).encode(encoding, errors)
-    if not isinstance(s, six.string_types):
+    if isinstance(s, six.text_type):
+        return s.encode(encoding, errors)
+    elif not isinstance(s, (six.binary_type, six.string_types)):
         try:
-            return str(s)
+            if six.PY2:
+                return str(s)
+            return str(s).encode(encoding, errors)
         except UnicodeEncodeError:
             if isinstance(s, Exception):
                 # An Exception subclass containing non-ASCII data that doesn't
@@ -126,8 +130,6 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
                 return ' '.join([smart_str(arg, encoding, strings_only,
                         errors) for arg in s])
             return six.text_type(s).encode(encoding, errors)
-    elif isinstance(s, six.text_type):
-        return s.encode(encoding, errors)
     elif s and encoding != 'utf-8':
         return s.decode('utf-8', errors).encode(encoding, errors)
     else:

From 90287612f6517ab07655acfe28cf43fd16ef8a1b Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 17:10:30 +0300
Subject: [PATCH 08/14] Fix ValueError if we get document with set encoding.
 Add test case for this.

---
 goose/parsers.py |  3 +--
 goose/text.py    | 34 +++++++++++++++++++++++++++++++---
 tests/parsers.py | 15 +++++++++++++++
 3 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/goose/parsers.py b/goose/parsers.py
index c0b091a9..61d6510e 100644
--- a/goose/parsers.py
+++ b/goose/parsers.py
@@ -26,8 +26,7 @@
 
 from lxml import etree
 from copy import deepcopy
-from goose.text import innerTrim
-from goose.text import encodeValue
+from goose.text import innerTrim, encodeValue, get_encodings_from_content
 
 
 class Parser(object):
diff --git a/goose/text.py b/goose/text.py
index 02846e20..960d0608 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -34,6 +34,28 @@
 TABSSPACE = re.compile(r'[\s\t]+')
 
 
+def get_encodings_from_content(content):
+    """
+    Code from:
+    https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py
+    Return encodings from given content string.
+    :param content: string to extract encodings from.
+    """
+    find_charset = re.compile(
+        r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+    ).findall
+
+    find_pragma = re.compile(
+        r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+    ).findall
+
+    find_xml = re.compile(
+        r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+    ).findall
+
+    return find_charset(content) + find_pragma(content) + find_xml(content)
+
+
 def innerTrim(value):
     if isinstance(value, (six.text_type, six.string_types)):
         # remove tab and white space
@@ -46,9 +68,15 @@ def innerTrim(value):
 def encodeValue(value):
     string_org = value
     try:
-        value = smart_unicode(value)
-    except (UnicodeEncodeError, DjangoUnicodeDecodeError):
-        value = smart_str(value)
+        encoding = get_encodings_from_content(value)
+        if encoding:
+            # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception;
+            value = smart_str(value)
+        else:
+            try:
+                value = smart_unicode(value)
+            except (UnicodeEncodeError, DjangoUnicodeDecodeError):
+                value = smart_str(value)
     except Exception:
         value = string_org
     return value
diff --git a/tests/parsers.py b/tests/parsers.py
index 41aa5934..e5f17164 100644
--- a/tests/parsers.py
+++ b/tests/parsers.py
@@ -259,6 +259,21 @@ def test_delAttribute(self):
         # remove an unexistant attribute
         self.parser.delAttribute(div,  attr="bla")
 
+    def test_encoding(self):
+        """
+        Passing a unicode string that contains an encoding declaration to lxml.html.fromstring raises:
+        "ValueError: Unicode strings with encoding declaration are not supported.
+        Please use bytes input or XML fragments without declaration."
+        Check that self.parser.fromstring accepts such a document without raising.
+        """
+        html = u"""<?xml version="1.0" encoding="utf-8"?>
+        <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+        """
+        html += u'<html><body>'
+        html += u'<p>Я рядочок</p>'
+        html += u'</body></html>'
+        self.parser.fromstring(html)
+
 
 class TestParser(ParserBase):
     pass

From 74743ab334af9c9c64e17c92227ceaca2fbc3ba9 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 18:55:11 +0300
Subject: [PATCH 09/14] Add py 3.4 to travis.yml

---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 2f2c722e..4b341e25 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -3,6 +3,7 @@ language: python
 python:
     - 2.6
     - 2.7
+    - 3.4
 
 install:
     - pip install -r requirements.txt --use-mirrors

From 5fbc788bd4340edaeb64012b1e6c1b8141bbf1cf Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 22:32:37 +0300
Subject: [PATCH 10/14] Stop installing from requirements.txt in travis.yml,
 as there is no way to avoid installing bs3 under py3

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 4b341e25..a242d0ac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,7 +6,7 @@ python:
     - 3.4
 
 install:
-    - pip install -r requirements.txt --use-mirrors
+    - pip install jieba
     - python setup.py install
 
 script: python setup.py test

From 1d029324a11305d9c93eae5181c33fda17c757cc Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 22:32:57 +0300
Subject: [PATCH 11/14] Close image file after use

---
 goose/utils/images.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/goose/utils/images.py b/goose/utils/images.py
index 92d5a133..31a55d61 100644
--- a/goose/utils/images.py
+++ b/goose/utils/images.py
@@ -38,9 +38,9 @@ class ImageUtils(object):
     def get_image_dimensions(self, identify_program, path):
         image_details = ImageDetails()
         try:
-            image = Image.open(path)
-            image_details.set_mime_type(image.format)
-            width, height = image.size
+            with Image.open(path) as image:
+                image_details.set_mime_type(image.format)
+                width, height = image.size
             image_details.set_width(width)
             image_details.set_height(height)
         except IOError:

From 9091e3827cae539b3bacd2cabd5eb3f4c39225ea Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Wed, 15 Apr 2015 22:35:10 +0300
Subject: [PATCH 12/14] Fix tests

---
 goose/parsers.py | 13 ++++++++++---
 goose/text.py    | 12 +++---------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/goose/parsers.py b/goose/parsers.py
index 61d6510e..fab3eb31 100644
--- a/goose/parsers.py
+++ b/goose/parsers.py
@@ -26,7 +26,7 @@
 
 from lxml import etree
 from copy import deepcopy
-from goose.text import innerTrim, encodeValue, get_encodings_from_content
+from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str
 
 
 class Parser(object):
@@ -51,8 +51,15 @@ def css_select(self, node, selector):
 
     @classmethod
     def fromstring(self, html):
-        html = encodeValue(html)
-        self.doc = lxml.html.fromstring(html)
+        encoding = get_encodings_from_content(html)
+        encoding = encoding and encoding[0] or None
+        if not encoding:
+            html = encodeValue(html)
+            self.doc = lxml.html.fromstring(html)
+        else:
+            html = smart_str(html, encoding=encoding)
+            parser = lxml.html.HTMLParser(encoding=encoding)
+            self.doc = lxml.html.fromstring(html, parser=parser)
         return self.doc
 
     @classmethod
diff --git a/goose/text.py b/goose/text.py
index 960d0608..343fdbc2 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -68,15 +68,9 @@ def innerTrim(value):
 def encodeValue(value):
     string_org = value
     try:
-        encoding = get_encodings_from_content(value)
-        if encoding:
-            # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception;
-            value = smart_str(value)
-        else:
-            try:
-                value = smart_unicode(value)
-            except (UnicodeEncodeError, DjangoUnicodeDecodeError):
-                value = smart_str(value)
+        value = smart_unicode(value)
+    except (UnicodeEncodeError, DjangoUnicodeDecodeError):
+        value = smart_str(value)
     except Exception:
         value = string_org
     return value

From 8fa55b4ebc41b2ebda2edfb309c01719769c9549 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 16 Apr 2015 21:57:27 +0300
Subject: [PATCH 13/14] Fix encoding detection

---
 goose/text.py           | 36 ++++++++++++++++++++++++------------
 goose/utils/encoding.py |  4 +---
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/goose/text.py b/goose/text.py
index 343fdbc2..31070cf0 100644
--- a/goose/text.py
+++ b/goose/text.py
@@ -41,18 +41,30 @@ def get_encodings_from_content(content):
     Return encodings from given content string.
     :param content: string to extract encodings from.
     """
-    find_charset = re.compile(
-        r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
-    ).findall
-
-    find_pragma = re.compile(
-        r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
-    ).findall
-
-    find_xml = re.compile(
-        r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
-    ).findall
-
+    if isinstance(content, six.binary_type) and six.PY3:
+        find_charset = re.compile(
+            br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            br'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    else:
+        find_charset = re.compile(
+            r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
     return find_charset(content) + find_pragma(content) + find_xml(content)
 
 
diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py
index 25022704..f94f476e 100644
--- a/goose/utils/encoding.py
+++ b/goose/utils/encoding.py
@@ -117,7 +117,7 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
     #     return unicode(s).encode(encoding, errors)
     if isinstance(s, six.text_type):
         return s.encode(encoding, errors)
-    elif not isinstance(s, (six.binary_type, six.string_types)):
+    elif not isinstance(s, six.binary_type):
         try:
             if six.PY2:
                 return str(s)
@@ -130,7 +130,5 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'):
                 return ' '.join([smart_str(arg, encoding, strings_only,
                         errors) for arg in s])
             return six.text_type(s).encode(encoding, errors)
-    elif s and encoding != 'utf-8':
-        return s.decode('utf-8', errors).encode(encoding, errors)
     else:
         return s

From 1ef277b5784887e93c952a4a7cf5ce3a5fb993b4 Mon Sep 17 00:00:00 2001
From: Shevchenko Vitaliy <vetal4444@gmail.com>
Date: Thu, 16 Apr 2015 21:57:44 +0300
Subject: [PATCH 14/14] Fix test runner under py3

---
 tests/extractors/base.py   | 10 +++++++---
 tests/extractors/images.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/extractors/base.py b/tests/extractors/base.py
index a0849e35..93b3c075 100644
--- a/tests/extractors/base.py
+++ b/tests/extractors/base.py
@@ -30,7 +30,8 @@
 except ImportError:
     import urllib.request as urllib2
 
-from six import StringIO
+import six
+from six import StringIO, BytesIO
 
 from goose import Goose
 from goose.utils import FileHelper
@@ -51,13 +52,16 @@ class MockResponse():
     def __init__(self, cls):
         self.cls = cls
 
-    def content(self):
+    def content(self, req):
         return "response"
 
     def response(self, req):
         data = self.content(req)
         url = req.get_full_url()
-        resp = urllib2.addinfourl(StringIO(data), data, url)
+        if isinstance(data, six.binary_type):
+            resp = urllib2.addinfourl(BytesIO(data), data, url)
+        else:
+            resp = urllib2.addinfourl(StringIO(data), data, url)
         resp.code = self.code
         resp.msg = self.msg
         return resp
diff --git a/tests/extractors/images.py b/tests/extractors/images.py
index 896d6985..9a9712a1 100644
--- a/tests/extractors/images.py
+++ b/tests/extractors/images.py
@@ -43,7 +43,7 @@
 class MockResponseImage(MockResponse):
 
     def image_content(self, req):
-        md5_hash = hashlib.md5(req.get_full_url()).hexdigest()
+        md5_hash = hashlib.md5(req.get_full_url().encode("utf-8")).hexdigest()
         current_test = self.cls._get_current_testname()
         path = os.path.join(
                 os.path.dirname(CURRENT_PATH),