diff --git a/README.rst b/README.rst index 17b0189..ab03983 100644 --- a/README.rst +++ b/README.rst @@ -36,6 +36,17 @@ Usage:: If you need different HTML output just subclass and override the ``format_*`` methods. +"broadcast" mentions and old-style retweets are now available as well:: + + >>> from ttp import ttp + >>> p = ttp.Parser() + >>> result = p.parse(".@eadmundo has added broadcast mentions!") + >>> result.broadcast + 'eadmundo' + >>> result = p.parse("RT @eadmundo, also old-style retweets") + >>> result.retweet + 'eadmundo' + You can also ask for the span tags to be returned for each entity:: >>> p = ttp.Parser(include_spans=True) diff --git a/ttp/tests.py b/ttp/tests.py index 39aa5ab..6df8ca0 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -509,6 +509,50 @@ def test_username_non_reply(self): self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, None) + # Broadcast mentions + def test_username_broadcast_mention_at_start(self): + result = self.parser.parse(u'.@username') + self.assertEqual(result.html, u'.@username') + self.assertEqual(result.users, [u'username']) + self.assertEqual(result.broadcast, u'username') + + def test_username_broadcast_mention_in_middle(self): + result = self.parser.parse(u'something .@username') + self.assertEqual(result.html, u'something .@username') + self.assertEqual(result.users, [u'username']) + self.assertEqual(result.broadcast, u'username') + + # Retweets + def test_username_old_style_retweet(self): + result = self.parser.parse(u'retweet RT @username something') + self.assertEqual(result.html, u'retweet RT @username something') + self.assertEqual(result.retweet, u'username') + + def test_username_old_style_retweet_at_beginning(self): + result = self.parser.parse(u'RT @username something') + self.assertEqual(result.html, u'RT @username something') + self.assertEqual(result.retweet, u'username') + + def test_username_quoted_retweet(self): + result = self.parser.parse(u'retweet "@username something"') +
self.assertEqual(result.html, u'retweet "@username something"') + self.assertEqual(result.retweet, u'username') + + def test_username_curly_quoted_retweet(self): + result = self.parser.parse(u'retweet “@username something”') + self.assertEqual(result.html, u'retweet “@username something”') + self.assertEqual(result.retweet, u'username') + + def test_username_quoted_retweet_at_beginning(self): + result = self.parser.parse(u'"@username something"') + self.assertEqual(result.html, u'"@username something"') + self.assertEqual(result.retweet, u'username') + + def test_username_curly_quoted_retweet_at_beginning(self): + result = self.parser.parse(u'“@username something”') + self.assertEqual(result.html, u'“@username something”') + self.assertEqual(result.retweet, u'username') + # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): diff --git a/ttp/ttp.py b/ttp/ttp.py index ac7c79e..4d0fd8d 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -30,6 +30,7 @@ AT_SIGNS = ur'[@\uff20]' UTF_CHARS = ur'a-z0-9_\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff' SPACES = ur'[\u0020\u00A0\u1680\u180E\u2002-\u202F\u205F\u2060\u3000]' +QUOTES = ur'[\u0022\u201C]' # Lists LIST_PRE_CHARS = ur'([^a-z0-9_]|^)' @@ -41,6 +42,8 @@ USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) +BROADCAST_REGEX = re.compile(ur'\.' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) +RETWEET_REGEX = re.compile(ur'(?:\bRT' + SPACES + ur'|' + QUOTES + ur')' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) # Hashtags HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS @@ -86,6 +89,15 @@ class ParseResult(object): Note: It's generally better to rely on the Tweet JSON/XML in order to find out if it's a reply or not.
+ - broadcast + A string containing the username this tweet was a broadcast mention to + (i.e. preceded by a dot to ensure that the mention is broadcast even to + non-followers of that username) + + - retweet + A string containing the username this tweet was a retweet of (as indicated by + either an old-style RT prefix or enclosing quotes) + - lists A list containing all the valid lists in the Tweet. Each list item is a tuple in the format (username, listname). @@ -100,11 +112,13 @@ class ParseResult(object): ''' - def __init__(self, urls, users, reply, lists, tags, html): + def __init__(self, urls, users, reply, broadcast, retweet, lists, tags, html): self.urls = urls if urls else [] self.users = users if users else [] self.lists = lists if lists else [] self.reply = reply if reply else None + self.broadcast = broadcast if broadcast else None + self.retweet = retweet if retweet else None self.tags = tags if tags else [] self.html = html @@ -127,8 +141,14 @@ def parse(self, text, html=True): reply = REPLY_REGEX.match(text) reply = reply.groups(0)[0] if reply is not None else None + broadcast = BROADCAST_REGEX.search(text) + broadcast = broadcast.groups(0)[0] if broadcast is not None else None + + retweet = RETWEET_REGEX.search(text) + retweet = retweet.groups(0)[0] if retweet is not None else None + parsed_html = self._html(text) if html else self._text(text) - return ParseResult(self._urls, self._users, reply, + return ParseResult(self._urls, self._users, reply, broadcast, retweet, self._lists, self._tags, parsed_html) def _text(self, text):