Skip to content

Commit 911c63f

Browse files
committed
v1.1.5
encodings fixes
1 parent 666d6ad commit 911c63f

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

urlsresolver/__init__.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
import re
66
from urlparse import urljoin
77

8-
__version__ = (1, 1, 4)
8+
from requests.packages import chardet
9+
10+
__version__ = (1, 1, 5)
911
__author__ = 'Alexandr Shurigin (https://github.com/phpdude/)'
1012

1113
# HTML tags syntax http://www.w3.org/TR/html-markup/syntax.html
@@ -83,7 +85,17 @@ def follow_meta_redirects(url, redirects, **kwargs):
8385
for r in resp.history:
8486
urls_history[r.url] = True
8587

86-
head, real_url = next(resp.iter_content(chunk_size, decode_unicode=False)), resp.url
88+
head, real_url = resp.iter_content(chunk_size).next(), resp.url
89+
90+
encoding = resp.encoding
91+
if encoding is None:
92+
# detect encoding
93+
encoding = chardet.detect(head)['encoding']
94+
95+
try:
96+
head = unicode(head, encoding, errors='replace')
97+
except (LookupError, TypeError):
98+
head = unicode(head, errors='replace')
8799

88100
# Removing html blocks in <noscript></noscript>
89101
if remove_noscript:

0 commit comments

Comments
 (0)