Skip to content

UnicodeDecodeError #6

@pratapbhanu

Description

@pratapbhanu

Hi

When I use p = ttp.Parser(); p.parse(tweettext, html=False), I get exceptions for some tweets due to invalid HTML character formatting, such as:

Traceback (most recent call last):
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 34, in <module>
extract_tweet_tags("twitter12051154249.txt")
File "/home/bhanu/git/misc/twitter/src/preprocessing.py", line 28, in extract_tweet_tags
result = ttp.Parser().parse(tweet.strip(), html=True)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 131, in parse
parsed_html = self._html(text) if html else self._text(text)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 148, in _html
return HASHTAG_REGEX.sub(self._parse_tags, html)
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 249, in _parse_tags
return '%s%s' % (pre, self.format_tag(tag, text))
File "/usr/local/lib/python2.7/dist-packages/ttp/ttp.py", line 270, in format_tag
% (urllib.quote('#' + text.encode('utf-8')), tag, text)
File "/usr/lib/python2.7/encodings/utf_8.py", line 16, in decode
return codecs.utf_8_decode(input, errors, True)
UnicodeDecodeError: 'utf8' codec can't decode byte 0xa5 in position 0: invalid start byte

I fixed it by adding a line in the following methods:

def parse(self, text, html=True):
    '''Parse the text and return a ParseResult instance.

    The per-call result lists are reset, the requested output mode is
    recorded, and the text is run through either the HTML or the
    plain-text pipeline.

    :param text: the tweet text to parse.
    :param html: when true, parse via ``self._html``; otherwise via
        ``self._text``.
    :returns: a ``ParseResult`` built from the collected urls, users,
        reply target, lists, tags and the pipeline's return value.
    '''
    # Start from a clean slate so results of earlier calls do not leak in.
    self._urls = []
    self._users = []
    self._lists = []
    self._tags = []
    # Remember the requested mode. ``_parse_tags`` consults this flag
    # because ``self._html`` is a method and therefore always truthy.
    self._is_html = html

    # The reply target is the first group of REPLY_REGEX when the text
    # matches; groups(0) substitutes 0 for a group that did not participate.
    reply = REPLY_REGEX.match(text)
    reply = reply.groups(0)[0] if reply is not None else None

    parsed_html = self._html(text) if html else self._text(text)
    return ParseResult(self._urls, self._users, reply,
                       self._lists, self._tags, parsed_html)

def _parse_tags(self, match):
    '''Parse hashtags.

    Used as a regex substitution callback: records the hashtag found in
    ``match`` on ``self._tags`` and, in HTML mode, returns the replacement
    markup for it.

    :param match: regex match object whose group 0 contains the hashtag,
        possibly preceded by extra captured characters.
    :returns: the preceding characters followed by
        ``self.format_tag(tag, text)`` when ``self._is_html`` is true,
        otherwise ``None``.
    '''
    mat = match.group(0)

    # Fix problems with the regex capturing stuff in front of the #.
    # Both the ASCII '#' and the fullwidth U+FF03 marker are recognised;
    # the last occurrence of the first marker found splits the match.
    # NOTE(review): presumably the hashtag regex guarantees one of the two
    # markers is present; otherwise ``pos`` stays -1 and ``tag`` stays None.
    tag = None
    for i in u'#\uff03':
        pos = mat.rfind(i)
        if pos != -1:
            tag = i
            break

    pre, text = mat[:pos], mat[pos + 1:]
    if self._include_spans:
        span = match.span(0)
        # add an offset if pre is e.g. ' '
        span = (span[0] + len(pre), span[1])
        self._tags.append((text, span))
    else:
        self._tags.append(text)

    # Check the mode flag rather than ``self._html``: the latter is a bound
    # method and always truthy, so the tag would be formatted even when
    # plain-text parsing was requested.
    if self._is_html:
        return '%s%s' % (pre, self.format_tag(tag, text))
    return None

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions