From 1a3bf1200de2929faef242a84bf7302dd6089be2 Mon Sep 17 00:00:00 2001 From: Edward Stone Date: Tue, 8 Oct 2013 11:34:51 +0100 Subject: [PATCH 1/5] tests and docstring for broadcast mentions and retweets --- ttp/tests.py | 29 +++++++++++++++++++++++++++++ ttp/ttp.py | 9 +++++++++ 2 files changed, 38 insertions(+) diff --git a/ttp/tests.py b/ttp/tests.py index 39aa5ab..051424f 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -509,6 +509,35 @@ def test_username_non_reply(self): self.assertEqual(result.users, [u'username']) self.assertEqual(result.reply, None) + # Broadcast mentions + def test_username_broadcast_mention_at_start(self): + result = self.parser.parse(u'.@username') + self.assertEqual(result.html, u'.@username') + self.assertEqual(result.users, [u'username']) + self.assertEqual(result.broadcast, u'username') + + def test_username_broadcast_mention_in_middle(self): + result = self.parser.parse(u'something .@username') + self.assertEqual(result.html, u'something .@username') + self.assertEqual(result.users, [u'username']) + self.assertEqual(result.broadcast, u'username') + + # Retweets + def test_username_old_style_retweet(self): + result = self.parser.parse(u'retweet RT @username something') + self.assertEqual(result.html, u'retweet RT @username something') + self.assertEqual(result.retweet, u'username') + + def test_username_quoted_retweet(self): + result = self.parser.parse(u'retweet "@username something"') + self.assertEqual(result.html, u'retweet "@username something"') + self.assertEqual(result.retweet, u'username') + + def test_username_curly_quoted_retweet(self): + result = self.parser.parse(u'retweet “@username something”') + self.assertEqual(result.html, u'retweet “@username something”') + self.assertEqual(result.retweet, u'username') + # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): diff --git a/ttp/ttp.py 
b/ttp/ttp.py index ac7c79e..9cd2b3d 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -86,6 +86,15 @@ class ParseResult(object): Note: It's generally better to rely on the Tweet JSON/XML in order to find out if it's a reply or not. + - broadcast + A string containing the username this tweet was a broadcast mention to + (e.g. preceded by a dot to ensure that mention is broadcast even to + non-followers of that username) + + - retweet + A string containing the username this tweet was a retweet of (as indicated by + either old-style RT or encased in quotes) + - lists A list containing all the valid lists in the Tweet. Each list item is a tuple in the format (username, listname). From eb2df2b4fe092431b179b530ce76f8ae8bd3d366 Mon Sep 17 00:00:00 2001 From: Edward Stone Date: Tue, 8 Oct 2013 14:13:01 +0100 Subject: [PATCH 2/5] check for broadcast mentions in tweet text --- ttp/ttp.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ttp/ttp.py b/ttp/ttp.py index 9cd2b3d..1a205c9 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -41,6 +41,7 @@ USERNAME_REGEX = re.compile(ur'\B' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) +BROADCAST_REGEX = re.compile(ur'\.'
+ AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) # Hashtags HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS @@ -109,11 +110,12 @@ class ParseResult(object): ''' - def __init__(self, urls, users, reply, lists, tags, html): + def __init__(self, urls, users, reply, broadcast, lists, tags, html): self.urls = urls if urls else [] self.users = users if users else [] self.lists = lists if lists else [] self.reply = reply if reply else None + self.broadcast = broadcast if broadcast else None self.tags = tags if tags else [] self.html = html @@ -136,8 +138,11 @@ def parse(self, text, html=True): reply = REPLY_REGEX.match(text) reply = reply.groups(0)[0] if reply is not None else None + broadcast = BROADCAST_REGEX.search(text) + broadcast = broadcast.groups(0)[0] if broadcast is not None else None + parsed_html = self._html(text) if html else self._text(text) - return ParseResult(self._urls, self._users, reply, + return ParseResult(self._urls, self._users, reply, broadcast, self._lists, self._tags, parsed_html) def _text(self, text): From 9dd05252e906b86acc1c7001b577d650ffa366d4 Mon Sep 17 00:00:00 2001 From: Edward Stone Date: Tue, 8 Oct 2013 14:16:03 +0100 Subject: [PATCH 3/5] couple more old-style retweet tests --- ttp/tests.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ttp/tests.py b/ttp/tests.py index 051424f..6df8ca0 100644 --- a/ttp/tests.py +++ b/ttp/tests.py @@ -528,6 +528,11 @@ def test_username_old_style_retweet(self): self.assertEqual(result.html, u'retweet RT @username something') self.assertEqual(result.retweet, u'username') + def test_username_old_style_retweet_at_beginning(self): + result = self.parser.parse(u'RT @username something') + self.assertEqual(result.html, u'RT @username something') + self.assertEqual(result.retweet, u'username') + def test_username_quoted_retweet(self): result = self.parser.parse(u'retweet "@username something"') self.assertEqual(result.html, u'retweet "@username something"') @@
-538,6 +543,16 @@ def test_username_curly_quoted_retweet(self): self.assertEqual(result.html, u'retweet “@username something”') self.assertEqual(result.retweet, u'username') + def test_username_quoted_retweet_at_beginning(self): + result = self.parser.parse(u'"@username something"') + self.assertEqual(result.html, u'"@username something"') + self.assertEqual(result.retweet, u'username') + + def test_username_curly_quoted_retweet_at_beginning(self): + result = self.parser.parse(u'“@username something”') + self.assertEqual(result.html, u'“@username something”') + self.assertEqual(result.retweet, u'username') + # List tests --------------------------------------------------------------- # -------------------------------------------------------------------------- def test_list_preceeded(self): From 7aea356b7acb005546bb667c87cb39bca7a31246 Mon Sep 17 00:00:00 2001 From: Edward Stone Date: Tue, 8 Oct 2013 14:39:15 +0100 Subject: [PATCH 4/5] check for old-style retweet in tweet text --- ttp/ttp.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/ttp/ttp.py b/ttp/ttp.py index 1a205c9..4d0fd8d 100644 --- a/ttp/ttp.py +++ b/ttp/ttp.py @@ -30,6 +30,7 @@ AT_SIGNS = ur'[@\uff20]' UTF_CHARS = ur'a-z0-9_\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff' SPACES = ur'[\u0020\u00A0\u1680\u180E\u2002-\u202F\u205F\u2060\u3000]' +QUOTES = ur'[\u0022\u201C]' # Lists LIST_PRE_CHARS = ur'([^a-z0-9_]|^)' @@ -42,6 +43,7 @@ REPLY_REGEX = re.compile(ur'^(?:' + SPACES + ur')*' + AT_SIGNS + ur'([a-z0-9_]{1,20}).*', re.IGNORECASE) BROADCAST_REGEX = re.compile(ur'\.'
+ AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) +RETWEET_REGEX = re.compile(ur'(?:\bRT' + SPACES + ur'|' + QUOTES + ur')' + AT_SIGNS + LIST_END_CHARS, re.IGNORECASE) # Hashtags HASHTAG_EXP = ur'(^|[^0-9A-Z&/]+)(#|\uff03)([0-9A-Z_]*[A-Z_]+[%s]*)' % UTF_CHARS @@ -110,12 +112,13 @@ class ParseResult(object): ''' - def __init__(self, urls, users, reply, broadcast, lists, tags, html): + def __init__(self, urls, users, reply, broadcast, retweet, lists, tags, html): self.urls = urls if urls else [] self.users = users if users else [] self.lists = lists if lists else [] self.reply = reply if reply else None self.broadcast = broadcast if broadcast else None + self.retweet = retweet if retweet else None self.tags = tags if tags else [] self.html = html @@ -141,8 +144,11 @@ def parse(self, text, html=True): broadcast = BROADCAST_REGEX.search(text) broadcast = broadcast.groups(0)[0] if broadcast is not None else None + retweet = RETWEET_REGEX.search(text) + retweet = retweet.groups(0)[0] if retweet is not None else None + parsed_html = self._html(text) if html else self._text(text) - return ParseResult(self._urls, self._users, reply, broadcast, + return ParseResult(self._urls, self._users, reply, broadcast, retweet, self._lists, self._tags, parsed_html) def _text(self, text): From 67501e3944c723bca6d69d57589adc317aa9090b Mon Sep 17 00:00:00 2001 From: Edward Stone Date: Tue, 8 Oct 2013 14:51:08 +0100 Subject: [PATCH 5/5] add docs for broadcast mentions and retweets to README --- README.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.rst b/README.rst index 17b0189..ab03983 100644 --- a/README.rst +++ b/README.rst @@ -36,6 +36,17 @@ Usage:: If you need different HTML output just subclass and override the ``format_*`` methods.
+"Broadcast" mentions and old-style retweets are now available as well:: + + >>> from ttp import ttp + >>> p = ttp.Parser() + >>> result = p.parse(".@eadmundo has added broadcast mentions!") + >>> result.broadcast + 'eadmundo' + >>> result = p.parse("RT @eadmundo, also old-style retweets") + >>> result.retweet + 'eadmundo' + You can also ask for the span tags to be returned for each entity:: >>> p = ttp.Parser(include_spans=True)