From 94b1a20854cee30f2d4b282de1ca68c3ba591c73 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 8 Apr 2015 19:14:47 +0300 Subject: [PATCH 01/14] Add python3 support --- goose/__init__.py | 9 ++++++--- goose/cleaners.py | 2 ++ goose/configuration.py | 9 +++++++-- goose/extractors/content.py | 2 +- goose/extractors/images.py | 6 +++++- goose/extractors/metas.py | 6 ++++-- goose/image.py | 4 ++++ goose/network.py | 17 +++++++++-------- goose/outputformatters.py | 6 +++++- goose/parsers.py | 7 +++++-- goose/text.py | 13 ++++++++----- goose/utils/__init__.py | 13 ++++++++++--- goose/utils/encoding.py | 28 +++++++++++++++------------- goose/utils/images.py | 10 +++++++--- setup.py | 2 +- tests/extractors/authors.py | 20 +++++++++++++++++--- tests/extractors/base.py | 11 +++++++++-- tests/extractors/content.py | 4 +++- tests/extractors/images.py | 6 ++++-- tests/extractors/links.py | 4 +++- tests/extractors/metas.py | 4 +++- tests/extractors/opengraph.py | 4 +++- tests/extractors/publishdate.py | 4 +++- tests/extractors/tags.py | 4 +++- tests/extractors/title.py | 4 +++- tests/extractors/tweets.py | 3 ++- tests/extractors/videos.py | 4 +++- tests/parsers.py | 3 +++ 28 files changed, 148 insertions(+), 61 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index 409b5732..f267fa34 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -64,9 +64,12 @@ def crawl(self, crawl_candiate): try: crawler = Crawler(self.config) article = crawler.crawl(crawl_candiate) - except (UnicodeDecodeError, ValueError): - self.config.parser_class = parsers[0] - return self.crawl(crawl_candiate) + except (UnicodeDecodeError, ValueError) as e: + if parsers: + self.config.parser_class = parsers[0] + return self.crawl(crawl_candiate) + else: + raise e return article def initialize(self): diff --git a/goose/cleaners.py b/goose/cleaners.py index c1384ee0..9ab45b6d 100644 --- a/goose/cleaners.py +++ b/goose/cleaners.py @@ -20,6 +20,8 @@ See the License for the specific 
language governing permissions and limitations under the License. """ +from __future__ import unicode_literals + from goose.utils import ReplaceSequence diff --git a/goose/configuration.py b/goose/configuration.py index fcfa5b9a..4913f699 100644 --- a/goose/configuration.py +++ b/goose/configuration.py @@ -22,6 +22,9 @@ """ import os import tempfile + +import six + from goose.text import StopWords from goose.parsers import Parser from goose.parsers import ParserSoup @@ -30,10 +33,12 @@ HTTP_DEFAULT_TIMEOUT = 30 AVAILABLE_PARSERS = { - 'lxml': Parser, - 'soup': ParserSoup, + 'lxml': Parser } +if six.PY2: + AVAILABLE_PARSERS['soup'] = ParserSoup + class Configuration(object): diff --git a/goose/extractors/content.py b/goose/extractors/content.py index e0703d55..afdc2c91 100644 --- a/goose/extractors/content.py +++ b/goose/extractors/content.py @@ -260,7 +260,7 @@ def update_score(self, node, addToScore): if score_string: current_score = int(score_string) - new_score = current_score + addToScore + new_score = current_score + int(addToScore) self.parser.setAttribute(node, "gravityScore", str(new_score)) def update_node_count(self, node, add_to_count): diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 3af44f5f..2dd63786 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,7 +23,11 @@ import re import os -from urlparse import urlparse, urljoin +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin + from goose.extractors import BaseExtractor from goose.image import Image diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index 95acadd5..ee7d520e 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,8 +22,10 @@ """ import re -from urlparse import urljoin -from urlparse import urlparse +try: + from urlparse import urlparse, urljoin +except ImportError: + from urllib.parse import urlparse, urljoin from goose.extractors import 
BaseExtractor diff --git a/goose/image.py b/goose/image.py index 351e3396..23026398 100644 --- a/goose/image.py +++ b/goose/image.py @@ -20,6 +20,10 @@ See the License for the specific language governing permissions and limitations under the License. """ +try: + long +except NameError: + long = int class Image(object): diff --git a/goose/network.py b/goose/network.py index 666a7d61..2b8265ad 100644 --- a/goose/network.py +++ b/goose/network.py @@ -20,7 +20,12 @@ See the License for the specific language governing permissions and limitations under the License. """ -import urllib2 +import six + +try: + from urllib2 import urlopen, Request +except ImportError: + from urllib.request import urlopen, Request class HtmlFetcher(object): @@ -39,18 +44,14 @@ def get_url(self): def get_html(self, url): # utf-8 encode unicode url - if isinstance(url, unicode): + if isinstance(url, six.text_type) and six.PY2: url = url.encode('utf-8') # set request - self.request = urllib2.Request( - url, - headers=self.headers) + self.request = Request(url, headers=self.headers) # do request try: - self.result = urllib2.urlopen( - self.request, - timeout=self.config.http_timeout) + self.result = urlopen(self.request, timeout=self.config.http_timeout) except Exception: self.result = None diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 1f8ba4bd..21dab451 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,7 +20,11 @@ See the License for the specific language governing permissions and limitations under the License. """ -from HTMLParser import HTMLParser +try: + from HTMLParser import HTMLParser +except ImportError: + from html.parser import HTMLParser + from goose.text import innerTrim diff --git a/goose/parsers.py b/goose/parsers.py index a43e9b47..c0b091a9 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -21,7 +21,9 @@ limitations under the License. 
""" import lxml.html -from lxml.html import soupparser + +import six + from lxml import etree from copy import deepcopy from goose.text import innerTrim @@ -56,7 +58,7 @@ def fromstring(self, html): @classmethod def nodeToString(self, node): - return etree.tostring(node) + return etree.tostring(node, encoding=six.text_type) @classmethod def replaceTag(self, node, tag): @@ -239,6 +241,7 @@ class ParserSoup(Parser): @classmethod def fromstring(self, html): + from lxml.html import soupparser html = encodeValue(html) self.doc = soupparser.fromstring(html) return self.doc diff --git a/goose/text.py b/goose/text.py index 3ef63d6b..02846e20 100644 --- a/goose/text.py +++ b/goose/text.py @@ -23,6 +23,9 @@ import os import re import string + +import six + from goose.utils import FileHelper from goose.utils.encoding import smart_unicode from goose.utils.encoding import smart_str @@ -32,7 +35,7 @@ def innerTrim(value): - if isinstance(value, (unicode, str)): + if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space value = re.sub(TABSSPACE, ' ', value) value = ''.join(value.splitlines()) @@ -87,7 +90,6 @@ def set_word_count(self, cnt): class StopWords(object): PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]") - TRANS_TABLE = string.maketrans('', '') _cached_stop_words = {} def __init__(self, language='en'): @@ -106,9 +108,10 @@ def __init__(self, language='en'): def remove_punctuation(self, content): # code taken form # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python - if isinstance(content, unicode): - content = content.encode('utf-8') - return content.translate(self.TRANS_TABLE, string.punctuation) + if not isinstance(content, six.text_type): + content = content.decode('utf-8') + tbl = dict.fromkeys(ord(x) for x in string.punctuation) + return content.translate(tbl) def candiate_words(self, stripped_input): return stripped_input.split(' ') diff --git 
a/goose/utils/__init__.py b/goose/utils/__init__.py index 5a1de7d4..41cf9c95 100644 --- a/goose/utils/__init__.py +++ b/goose/utils/__init__.py @@ -26,7 +26,13 @@ import os import goose import codecs -import urlparse + +import six + +try: + from urlparse import urlparse +except ImportError: + from urllib.parse import urlparse class BuildURL(object): @@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash): class RawHelper(object): @classmethod def get_parsing_candidate(self, url, raw_html): - if isinstance(raw_html, unicode): + if isinstance(raw_html, six.text_type): raw_html = raw_html.encode('utf-8') link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time()) return ParsingCandidate(url, link_hash) @@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl): # replace shebang is urls final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \ if '#!' in url_to_crawl else url_to_crawl - link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time()) + url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url + link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time()) return ParsingCandidate(final_url, link_hash) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index 4dc23ca7..eb98917c 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- -import types import datetime + +import six + from decimal import Decimal @@ -45,8 +47,8 @@ def is_protected_type(obj): force_unicode(strings_only=True). """ return isinstance(obj, ( - types.NoneType, - int, long, + type(None), + six.integer_types, datetime.datetime, datetime.date, datetime.time, float, Decimal) ) @@ -62,17 +64,17 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # Handle the common case first, saves 30-40% in performance when s # is an instance of unicode. This function gets called often in that # setting. 
- if isinstance(s, unicode): + if isinstance(s, six.text_type): return s if strings_only and is_protected_type(s): return s try: - if not isinstance(s, basestring,): + if not isinstance(s, six.string_types,): if hasattr(s, '__unicode__'): - s = unicode(s) + s = s.__unicode__() else: try: - s = unicode(str(s), encoding, errors) + s = six.text_type(s, encoding, errors) except UnicodeEncodeError: if not isinstance(s, Exception): raise @@ -84,12 +86,12 @@ def force_unicode(s, encoding='utf-8', strings_only=False, errors='strict'): # output should be. s = u' '.join([force_unicode(arg, encoding, strings_only, errors) for arg in s]) - elif not isinstance(s, unicode): + elif not isinstance(s, six.text_type): # Note: We use .decode() here, instead of unicode(s, encoding, # errors), so that if s is a SafeString, it ends up being a # SafeUnicode at the end. s = s.decode(encoding, errors) - except UnicodeDecodeError, e: + except UnicodeDecodeError as e: if not isinstance(s, Exception): raise DjangoUnicodeDecodeError(s, *e.args) else: @@ -109,11 +111,11 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): If strings_only is True, don't convert (some) non-string-like objects. """ - if strings_only and isinstance(s, (types.NoneType, int)): + if strings_only and isinstance(s, (type(None), int)): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, basestring): + if not isinstance(s, six.string_types): try: return str(s) except UnicodeEncodeError: @@ -123,8 +125,8 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): # further exception. 
return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) - return unicode(s).encode(encoding, errors) - elif isinstance(s, unicode): + return six.text_type(s).encode(encoding, errors) + elif isinstance(s, six.text_type): return s.encode(encoding, errors) elif s and encoding != 'utf-8': return s.decode('utf-8', errors).encode(encoding, errors) diff --git a/goose/utils/images.py b/goose/utils/images.py index 388d5c85..76a8c72f 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,8 +22,12 @@ """ import hashlib import os -import urllib2 +try: + from urllib2 import urlopen, Request +except ImportError: + from urllib.request import urlopen, Request from PIL import Image + from goose.utils.encoding import smart_str from goose.image import ImageDetails from goose.image import LocallyStoredImage @@ -115,8 +119,8 @@ def clean_src_string(self, src): @classmethod def fetch(self, http_client, src): try: - req = urllib2.Request(src) - f = urllib2.urlopen(req) + req = Request(src) + f = urlopen(req) data = f.read() return data except Exception: diff --git a/setup.py b/setup.py index ebad2547..fbe60081 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,6 @@ packages=find_packages(), include_package_data=True, zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'beautifulsoup', 'nltk'], + install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'], test_suite="tests" ) diff --git a/tests/extractors/authors.py b/tests/extractors/authors.py index 709040c1..a21d362e 100644 --- a/tests/extractors/authors.py +++ b/tests/extractors/authors.py @@ -21,12 +21,26 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleAuthor(TestExtractionBase): def test_author_schema(self): article = self.getArticle() - fields = ['authors'] - self.runArticleAssertions(article=article, fields=fields) + field = 'authors' + + # Do not call self.runArticleAssertions because need to sort results, + # because set not save ordering, so test failed; + + expected_value = self.data['expected'][field] + result_value = getattr(article, field, None) + + expected_value.sort() + result_value.sort() + + # default assertion + msg = u"Error %s \nexpected: %s\nresult: %s" % (field, expected_value, result_value) + self.assertEqual(expected_value, result_value, msg=msg) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index e19d20e0..72d4c601 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -22,11 +22,18 @@ """ import os import json -import urllib2 import unittest import socket -from StringIO import StringIO +try: + import urllib2 +except ImportError: + import urllib.request as urllib2 + +try: + from StringIO import StringIO +except ImportError: + from io import StringIO from goose import Goose from goose.utils import FileHelper diff --git a/tests/extractors/content.py b/tests/extractors/content.py index 30dc2754..854c4bd1 100644 --- a/tests/extractors/content.py +++ b/tests/extractors/content.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase from goose.text import StopWordsChinese from goose.text import StopWordsArabic diff --git a/tests/extractors/images.py b/tests/extractors/images.py index e47a1dde..896d6985 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -20,13 +20,15 @@ See the License for the specific language governing permissions and limitations under the License. """ +from __future__ import absolute_import + import os import json import hashlib import unittest -from base import MockResponse -from base import TestExtractionBase +from .base import MockResponse +from .base import TestExtractionBase from goose.configuration import Configuration from goose.image import Image diff --git a/tests/extractors/links.py b/tests/extractors/links.py index 8539465e..ea15a459 100644 --- a/tests/extractors/links.py +++ b/tests/extractors/links.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleLinks(TestExtractionBase): diff --git a/tests/extractors/metas.py b/tests/extractors/metas.py index fd45915a..a4eef74c 100644 --- a/tests/extractors/metas.py +++ b/tests/extractors/metas.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestMetas(TestExtractionBase): diff --git a/tests/extractors/opengraph.py b/tests/extractors/opengraph.py index 415a784c..a0616227 100644 --- a/tests/extractors/opengraph.py +++ b/tests/extractors/opengraph.py @@ -21,7 +21,9 @@ limitations under the License. 
""" -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestOpenGraph(TestExtractionBase): diff --git a/tests/extractors/publishdate.py b/tests/extractors/publishdate.py index 8d2a13b9..355250d5 100644 --- a/tests/extractors/publishdate.py +++ b/tests/extractors/publishdate.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestPublishDate(TestExtractionBase): diff --git a/tests/extractors/tags.py b/tests/extractors/tags.py index 22b17129..2f5562ba 100644 --- a/tests/extractors/tags.py +++ b/tests/extractors/tags.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestArticleTags(TestExtractionBase): diff --git a/tests/extractors/title.py b/tests/extractors/title.py index 09170205..c6f7813c 100644 --- a/tests/extractors/title.py +++ b/tests/extractors/title.py @@ -21,7 +21,9 @@ limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class TestTitle(TestExtractionBase): diff --git a/tests/extractors/tweets.py b/tests/extractors/tweets.py index 50300f43..3f72a604 100644 --- a/tests/extractors/tweets.py +++ b/tests/extractors/tweets.py @@ -20,8 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. 
""" +from __future__ import absolute_import -from base import TestExtractionBase +from .base import TestExtractionBase class TestArticleTweet(TestExtractionBase): diff --git a/tests/extractors/videos.py b/tests/extractors/videos.py index 10be15ff..0350c8c3 100644 --- a/tests/extractors/videos.py +++ b/tests/extractors/videos.py @@ -20,7 +20,9 @@ See the License for the specific language governing permissions and limitations under the License. """ -from base import TestExtractionBase +from __future__ import absolute_import + +from .base import TestExtractionBase class ImageExtractionTests(TestExtractionBase): diff --git a/tests/parsers.py b/tests/parsers.py index 6614368d..7b47d89e 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -22,6 +22,7 @@ """ import os import unittest +import sys from goose.utils import FileHelper from goose.parsers import Parser @@ -260,5 +261,7 @@ class TestParser(ParserBase): class TestParserSoup(ParserBase): + + @unittest.skipIf(sys.version_info.major != 2, "supported only in python2") def setUp(self): self.parser = ParserSoup From 6d9156595ffa093febad6e137f877add162e269a Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 8 Apr 2015 19:19:53 +0300 Subject: [PATCH 02/14] Update requirements --- requirements.txt | 3 ++- setup.py | 9 ++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7e6a6c09..bbd377ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,6 @@ Pillow lxml cssselect jieba -beautifulsoup +beautifulsoup # Only on python2 nltk +six diff --git a/setup.py b/setup.py index fbe60081..df367682 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,8 @@ """ import os +import sys + from setuptools import setup, find_packages from imp import load_source @@ -53,6 +55,11 @@ except Exception: long_description = description +requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] +if sys.version_info.major == 2: + 
requirements.append('beautifulsoup') + + setup(name='goose-extractor', version=version.__version__, description=description, @@ -66,6 +73,6 @@ packages=find_packages(), include_package_data=True, zip_safe=False, - install_requires=['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk'], + install_requires=requirements, test_suite="tests" ) From 79a12dd349efdb77c2762e231b8dc6e5c4166a2b Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 11:58:29 +0300 Subject: [PATCH 03/14] Add python3 to CLASSIFIERS --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index df367682..ab03c825 100644 --- a/setup.py +++ b/setup.py @@ -42,6 +42,7 @@ 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', 'Topic :: Internet', 'Topic :: Utilities', 'Topic :: Software Development :: Libraries :: Python Modules'] From 76af358635237859917fec2e8f81cb17c6004934 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 12:21:19 +0300 Subject: [PATCH 04/14] Optimize imports --- goose/__init__.py | 1 - goose/extractors/images.py | 6 +----- goose/extractors/metas.py | 6 ++---- goose/image.py | 8 ++------ goose/outputformatters.py | 5 +---- goose/utils/images.py | 7 +++---- goose/video.py | 1 + tests/extractors/base.py | 5 +---- 8 files changed, 11 insertions(+), 28 deletions(-) diff --git a/goose/__init__.py b/goose/__init__.py index f267fa34..d1cd6da8 100644 --- a/goose/__init__.py +++ b/goose/__init__.py @@ -21,7 +21,6 @@ limitations under the License. 
""" import os -import platform from tempfile import mkstemp from goose.version import version_info, __version__ diff --git a/goose/extractors/images.py b/goose/extractors/images.py index 2dd63786..f258aead 100644 --- a/goose/extractors/images.py +++ b/goose/extractors/images.py @@ -23,11 +23,7 @@ import re import os -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin - +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor from goose.image import Image diff --git a/goose/extractors/metas.py b/goose/extractors/metas.py index ee7d520e..5a65aa16 100644 --- a/goose/extractors/metas.py +++ b/goose/extractors/metas.py @@ -22,10 +22,8 @@ """ import re -try: - from urlparse import urlparse, urljoin -except ImportError: - from urllib.parse import urlparse, urljoin + +from six.moves.urllib.parse import urlparse, urljoin from goose.extractors import BaseExtractor diff --git a/goose/image.py b/goose/image.py index 23026398..58ddd021 100644 --- a/goose/image.py +++ b/goose/image.py @@ -20,10 +20,6 @@ See the License for the specific language governing permissions and limitations under the License. """ -try: - long -except NameError: - long = int class Image(object): @@ -50,7 +46,7 @@ def __init__(self): self.extraction_type = "NA" # stores how many bytes this image is. 
- self.bytes = long(0) + self.bytes = 0 def get_src(self): return self.src @@ -91,7 +87,7 @@ def set_mime_type(self, mime_type): class LocallyStoredImage(object): def __init__(self, src='', local_filename='', - link_hash='', bytes=long(0), file_extension='', height=0, width=0): + link_hash='', bytes=0, file_extension='', height=0, width=0): self.src = src self.local_filename = local_filename self.link_hash = link_hash diff --git a/goose/outputformatters.py b/goose/outputformatters.py index 21dab451..808f2eee 100644 --- a/goose/outputformatters.py +++ b/goose/outputformatters.py @@ -20,10 +20,7 @@ See the License for the specific language governing permissions and limitations under the License. """ -try: - from HTMLParser import HTMLParser -except ImportError: - from html.parser import HTMLParser +from six.moves.html_parser import HTMLParser from goose.text import innerTrim diff --git a/goose/utils/images.py b/goose/utils/images.py index 76a8c72f..92d5a133 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -22,10 +22,9 @@ """ import hashlib import os -try: - from urllib2 import urlopen, Request -except ImportError: - from urllib.request import urlopen, Request + +from six.moves.urllib.request import urlopen, Request + from PIL import Image from goose.utils.encoding import smart_str diff --git a/goose/video.py b/goose/video.py index 8509bba0..0691ac96 100644 --- a/goose/video.py +++ b/goose/video.py @@ -21,6 +21,7 @@ limitations under the License. 
""" + class Video(object): """\ Video object diff --git a/tests/extractors/base.py b/tests/extractors/base.py index 72d4c601..a0849e35 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -30,10 +30,7 @@ except ImportError: import urllib.request as urllib2 -try: - from StringIO import StringIO -except ImportError: - from io import StringIO +from six import StringIO from goose import Goose from goose.utils import FileHelper From f44c2af9e6eee9ac21f45612f1a6b76ee3682cd0 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 12:37:33 +0300 Subject: [PATCH 05/14] Restore python 2.6 support --- setup.py | 2 +- tests/parsers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index ab03c825..c046ed82 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ long_description = description requirements = ['Pillow', 'lxml', 'cssselect', 'jieba', 'nltk', 'six'] -if sys.version_info.major == 2: +if sys.version_info[0] == 2: requirements.append('beautifulsoup') diff --git a/tests/parsers.py b/tests/parsers.py index 7b47d89e..812cc6b2 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -22,7 +22,7 @@ """ import os import unittest -import sys +import six from goose.utils import FileHelper from goose.parsers import Parser @@ -262,6 +262,6 @@ class TestParser(ParserBase): class TestParserSoup(ParserBase): - @unittest.skipIf(sys.version_info.major != 2, "supported only in python2") + @unittest.skipIf(six.PY3, "supported only in python2") def setUp(self): self.parser = ParserSoup From 2e18083b9f52a903e53e5bbe30b55d28b37d181b Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 9 Apr 2015 13:06:08 +0300 Subject: [PATCH 06/14] Try to fix tests in python 2.6 --- setup.py | 2 ++ tests/parsers.py | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c046ed82..bce19c5c 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,8 @@ requirements = ['Pillow', 'lxml', 
'cssselect', 'jieba', 'nltk', 'six'] if sys.version_info[0] == 2: requirements.append('beautifulsoup') + if sys.version_info[1] < 7: + requirements.append('unittest2') setup(name='goose-extractor', diff --git a/tests/parsers.py b/tests/parsers.py index 812cc6b2..41aa5934 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -21,7 +21,11 @@ limitations under the License. """ import os -import unittest +try: + import unittest2 as unittest # Need to support skipIf in python 2.6 +except ImportError: + import unittest + import six from goose.utils import FileHelper From b7884f1c3e219dd2c4fa473a10f0aba80618e628 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 17:09:05 +0300 Subject: [PATCH 07/14] Fix smart_str --- goose/utils/encoding.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index eb98917c..25022704 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -115,9 +115,13 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): return s # if isinstance(s, Promise): # return unicode(s).encode(encoding, errors) - if not isinstance(s, six.string_types): + if isinstance(s, six.text_type): + return s.encode(encoding, errors) + elif not isinstance(s, (six.binary_type, six.string_types)): try: - return str(s) + if six.PY2: + return str(s) + return str(s).encode(encoding, errors) except UnicodeEncodeError: if isinstance(s, Exception): # An Exception subclass containing non-ASCII data that doesn't @@ -126,8 +130,6 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) return six.text_type(s).encode(encoding, errors) - elif isinstance(s, six.text_type): - return s.encode(encoding, errors) elif s and encoding != 'utf-8': return s.decode('utf-8', errors).encode(encoding, errors) else: From 90287612f6517ab07655acfe28cf43fd16ef8a1b Mon Sep 17 
00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 17:10:30 +0300 Subject: [PATCH 08/14] Fix ValueError if we get document with set encoding. Add test case for this. --- goose/parsers.py | 3 +-- goose/text.py | 34 +++++++++++++++++++++++++++++++--- tests/parsers.py | 15 +++++++++++++++ 3 files changed, 47 insertions(+), 5 deletions(-) diff --git a/goose/parsers.py b/goose/parsers.py index c0b091a9..61d6510e 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,8 +26,7 @@ from lxml import etree from copy import deepcopy -from goose.text import innerTrim -from goose.text import encodeValue +from goose.text import innerTrim, encodeValue, get_encodings_from_content class Parser(object): diff --git a/goose/text.py b/goose/text.py index 02846e20..960d0608 100644 --- a/goose/text.py +++ b/goose/text.py @@ -34,6 +34,28 @@ TABSSPACE = re.compile(r'[\s\t]+') +def get_encodings_from_content(content): + """ + Code from: + https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py + Return encodings from given content string. + :param content: string to extract encodings from. 
+ """ + find_charset = re.compile( + r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I + ).findall + + find_pragma = re.compile( + r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I + ).findall + + find_xml = re.compile( + r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + + return find_charset(content) + find_pragma(content) + find_xml(content) + + def innerTrim(value): if isinstance(value, (six.text_type, six.string_types)): # remove tab and white space @@ -46,9 +68,15 @@ def innerTrim(value): def encodeValue(value): string_org = value try: - value = smart_unicode(value) - except (UnicodeEncodeError, DjangoUnicodeDecodeError): - value = smart_str(value) + encoding = get_encodings_from_content(value) + if encoding: + # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception; + value = smart_str(value) + else: + try: + value = smart_unicode(value) + except (UnicodeEncodeError, DjangoUnicodeDecodeError): + value = smart_str(value) except Exception: value = string_org return value diff --git a/tests/parsers.py b/tests/parsers.py index 41aa5934..e5f17164 100644 --- a/tests/parsers.py +++ b/tests/parsers.py @@ -259,6 +259,21 @@ def test_delAttribute(self): # remove an unexistant attribute self.parser.delAttribute(div, attr="bla") + def test_encoding(self): + """ + If pass unicode string to lxml.html.fromstring with encoding set in document will receive: + "ValueError: Unicode strings with encoding declaration are not supported. + Please use bytes input or XML fragments without declaration." + Test for this case. + """ + html = u""" + + """ + html += u'' + html += u'

Я рядочок

' + html += u'' + self.parser.fromstring(html) + class TestParser(ParserBase): pass From 74743ab334af9c9c64e17c92227ceaca2fbc3ba9 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 18:55:11 +0300 Subject: [PATCH 09/14] Add py 3.4 to travis.yml --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 2f2c722e..4b341e25 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,7 @@ language: python python: - 2.6 - 2.7 + - 3.4 install: - pip install -r requirements.txt --use-mirrors From 5fbc788bd4340edaeb64012b1e6c1b8141bbf1cf Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:32:37 +0300 Subject: [PATCH 10/14] Remove install from requirements file in travis.yml as no way to avoid install bs3 under py3 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 4b341e25..a242d0ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,7 +6,7 @@ python: - 3.4 install: - - pip install -r requirements.txt --use-mirrors + - pip install jieba - python setup.py install script: python setup.py test From 1d029324a11305d9c93eae5181c33fda17c757cc Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:32:57 +0300 Subject: [PATCH 11/14] Close image file after use --- goose/utils/images.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/goose/utils/images.py b/goose/utils/images.py index 92d5a133..31a55d61 100644 --- a/goose/utils/images.py +++ b/goose/utils/images.py @@ -38,9 +38,9 @@ class ImageUtils(object): def get_image_dimensions(self, identify_program, path): image_details = ImageDetails() try: - image = Image.open(path) - image_details.set_mime_type(image.format) - width, height = image.size + with Image.open(path) as image: + image_details.set_mime_type(image.format) + width, height = image.size image_details.set_width(width) image_details.set_height(height) except IOError: From 
9091e3827cae539b3bacd2cabd5eb3f4c39225ea Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Wed, 15 Apr 2015 22:35:10 +0300 Subject: [PATCH 12/14] Fix tests --- goose/parsers.py | 13 ++++++++++--- goose/text.py | 12 +++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/goose/parsers.py b/goose/parsers.py index 61d6510e..fab3eb31 100644 --- a/goose/parsers.py +++ b/goose/parsers.py @@ -26,7 +26,7 @@ from lxml import etree from copy import deepcopy -from goose.text import innerTrim, encodeValue, get_encodings_from_content +from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str class Parser(object): @@ -51,8 +51,15 @@ def css_select(self, node, selector): @classmethod def fromstring(self, html): - html = encodeValue(html) - self.doc = lxml.html.fromstring(html) + encoding = get_encodings_from_content(html) + encoding = encoding and encoding[0] or None + if not encoding: + html = encodeValue(html) + self.doc = lxml.html.fromstring(html) + else: + html = smart_str(html, encoding=encoding) + parser = lxml.html.HTMLParser(encoding=encoding) + self.doc = lxml.html.fromstring(html, parser=parser) return self.doc @classmethod diff --git a/goose/text.py b/goose/text.py index 960d0608..343fdbc2 100644 --- a/goose/text.py +++ b/goose/text.py @@ -68,15 +68,9 @@ def innerTrim(value): def encodeValue(value): string_org = value try: - encoding = get_encodings_from_content(value) - if encoding: - # If encoding is set we must pass bytes to lxml.html.fromstring or will get exception; - value = smart_str(value) - else: - try: - value = smart_unicode(value) - except (UnicodeEncodeError, DjangoUnicodeDecodeError): - value = smart_str(value) + value = smart_unicode(value) + except (UnicodeEncodeError, DjangoUnicodeDecodeError): + value = smart_str(value) except Exception: value = string_org return value From 8fa55b4ebc41b2ebda2edfb309c01719769c9549 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 16 Apr 2015 21:57:27 
+0300 Subject: [PATCH 13/14] Fix encoding detection --- goose/text.py | 36 ++++++++++++++++++++++++------------ goose/utils/encoding.py | 4 +--- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/goose/text.py b/goose/text.py index 343fdbc2..31070cf0 100644 --- a/goose/text.py +++ b/goose/text.py @@ -41,18 +41,30 @@ def get_encodings_from_content(content): Return encodings from given content string. :param content: string to extract encodings from. """ - find_charset = re.compile( - r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I - ).findall - - find_pragma = re.compile( - r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I - ).findall - - find_xml = re.compile( - r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' - ).findall - + if isinstance(content, six.binary_type) and six.PY3: + find_charset = re.compile( + br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I + ).findall + + find_pragma = re.compile( + br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I + ).findall + + find_xml = re.compile( + br'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall + else: + find_charset = re.compile( + r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I + ).findall + + find_pragma = re.compile( + r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I + ).findall + + find_xml = re.compile( + r'^<\?xml.*?encoding=["\']*(.+?)["\'>]' + ).findall return find_charset(content) + find_pragma(content) + find_xml(content) diff --git a/goose/utils/encoding.py b/goose/utils/encoding.py index 25022704..f94f476e 100644 --- a/goose/utils/encoding.py +++ b/goose/utils/encoding.py @@ -117,7 +117,7 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): # return unicode(s).encode(encoding, errors) if isinstance(s, six.text_type): return s.encode(encoding, errors) - elif not isinstance(s, (six.binary_type, six.string_types)): + elif not isinstance(s, six.binary_type): try: if six.PY2: return str(s) @@ -130,7 +130,5 @@ def smart_str(s, encoding='utf-8', strings_only=False, errors='strict'): return ' '.join([smart_str(arg, encoding, strings_only, errors) for arg in s]) return six.text_type(s).encode(encoding, errors) - elif s and encoding != 'utf-8': - return s.decode('utf-8',
errors).encode(encoding, errors) else: return s From 1ef277b5784887e93c952a4a7cf5ce3a5fb993b4 Mon Sep 17 00:00:00 2001 From: Shevchenko Vitaliy Date: Thu, 16 Apr 2015 21:57:44 +0300 Subject: [PATCH 14/14] Fix test runner under py3 --- tests/extractors/base.py | 10 +++++++--- tests/extractors/images.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/extractors/base.py b/tests/extractors/base.py index a0849e35..93b3c075 100644 --- a/tests/extractors/base.py +++ b/tests/extractors/base.py @@ -30,7 +30,8 @@ except ImportError: import urllib.request as urllib2 -from six import StringIO +import six +from six import StringIO, BytesIO from goose import Goose from goose.utils import FileHelper @@ -51,13 +52,16 @@ class MockResponse(): def __init__(self, cls): self.cls = cls - def content(self): + def content(self, req): return "response" def response(self, req): data = self.content(req) url = req.get_full_url() - resp = urllib2.addinfourl(StringIO(data), data, url) + if isinstance(data, six.binary_type): + resp = urllib2.addinfourl(BytesIO(data), data, url) + else: + resp = urllib2.addinfourl(StringIO(data), data, url) resp.code = self.code resp.msg = self.msg return resp diff --git a/tests/extractors/images.py b/tests/extractors/images.py index 896d6985..9a9712a1 100644 --- a/tests/extractors/images.py +++ b/tests/extractors/images.py @@ -43,7 +43,7 @@ class MockResponseImage(MockResponse): def image_content(self, req): - md5_hash = hashlib.md5(req.get_full_url()).hexdigest() + md5_hash = hashlib.md5(req.get_full_url().encode("utf-8")).hexdigest() current_test = self.cls._get_current_testname() path = os.path.join( os.path.dirname(CURRENT_PATH),