-
Notifications
You must be signed in to change notification settings - Fork 10
Description
Hi
When I use `p = ttp.Parser(); p.parse(tweettext, html=False)`, I get exceptions for some tweets due to invalid HTML character formatting:
Traceback (most recent call last):
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 34, in <module>
extract_tweet_tags("twitter12051154249.txt")
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 28, in extract_tweet_tags
result = ttp.Parser().parse(tweet.strip(), html=True)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 131, in parse
parsed_html = self._html(text) if html else self._text(text)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 148, in _html
return HASHTAG_REGEX.sub(self._parse_tags, html)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 249, in _parse_tags
return '%s%s' % (pre, self.format_tag(tag, text))
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 270, in format_tag
% (urllib.quote('#' + text.encode('utf-8')), tag, text)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte
I fixed it by adding one line to `parse` (storing the `html` flag) and changing the condition at the end of `_parse_tags` to test that flag:
def parse(self, text, html=True):
    """Parse ``text`` and return a ParseResult instance.

    The url, user, list and tag accumulators are reset on every call.
    ``html`` chooses whether the parsed output comes from ``self._html``
    or from ``self._text``.
    """
    self._urls, self._users, self._lists, self._tags = [], [], [], []

    # Keep the requested mode in its own attribute: ``self._html`` is a
    # method, so it is always truthy and cannot serve as the flag.
    self._is_html = html

    reply_match = REPLY_REGEX.match(text)
    if reply_match is None:
        reply = None
    else:
        reply = reply_match.groups(0)[0]

    if html:
        parsed_html = self._html(text)
    else:
        parsed_html = self._text(text)

    return ParseResult(self._urls, self._users, reply,
                       self._lists, self._tags, parsed_html)
def _parse_tags(self, match):
    """Record the hashtag found in ``match``; return its markup in HTML mode.

    The tag text (optionally paired with its span) is appended to
    ``self._tags``. When ``self._is_html`` is true the text captured before
    the hash sign is returned followed by ``self.format_tag(tag, text)``;
    otherwise nothing is returned.
    """
    matched = match.group(0)

    # The regex can capture characters in front of the hash sign, so find
    # the last hash character: ASCII '#' first, then full-width U+FF03.
    tag = None
    for candidate in u'#\uff03':
        pos = matched.rfind(candidate)
        if pos != -1:
            tag = candidate
            break

    pre = matched[:pos]
    text = matched[pos + 1:]

    if self._include_spans:
        start, end = match.span(0)
        # Move the start past any captured prefix (e.g. a leading ' ').
        self._tags.append((text, (start + len(pre), end)))
    else:
        self._tags.append(text)

    if not self._is_html:
        return None
    return '%s%s' % (pre, self.format_tag(tag, text))