diff --git a/README.md b/README.md index 8720c96..1df0457 100644 --- a/README.md +++ b/README.md @@ -58,9 +58,9 @@ print(tokens) ``` ## Reddit version - -A version of the tokenizer sensitive to Reddit usernames is also provided. -Will work with either the `/u/name` or `u/name` format. + +A version of the tokenizer sensitive to Reddit usernames and subreddits is also provided. +It works with both the `/u/name` and `u/name` user formats and the `/r/name` and `r/name` subreddit formats. ``` from tokenizer import tokenizer diff --git a/tokenizer/test_tokenizer.py b/tokenizer/test_tokenizer.py index 4b3b46a..e706091 100644 --- a/tokenizer/test_tokenizer.py +++ b/tokenizer/test_tokenizer.py @@ -1,10 +1,12 @@ import unittest import tokenizer + class TestTokenizerDefaults(unittest.TestCase): def setUp(self): self.T = tokenizer.TweetTokenizer() + self.redditT = tokenizer.RedditTokenizer() def test_emoticon(self): text = 'this is a tweet with kitty =^^= emoticon' @@ -48,6 +50,18 @@ def test_handle(self): expected = ['a', 'tweet', 'at', '@some_handle', 'somewhere'] self.assertEqual(actual, expected) + def test_reddit_user(self): + text = "reddit with user u/reddit-name mention" + actual = self.redditT.tokenize(text) + expected = ['reddit', 'with', 'user', 'u/reddit-name', 'mention'] + self.assertEqual(actual, expected) + + def test_reddit_subreddit(self): + text = "reddit with r/subreddit mention" + actual = self.redditT.tokenize(text) + expected = ['reddit', 'with', 'r/subreddit', 'mention'] + self.assertEqual(actual, expected) + class TestTokenizerRegularizations(unittest.TestCase): def test_hash_removal(self): diff --git a/tokenizer/tokenizer.py b/tokenizer/tokenizer.py index 63fc61f..77d7d1a 100644 --- a/tokenizer/tokenizer.py +++ b/tokenizer/tokenizer.py @@ -63,10 +63,11 @@ item = re.escape(item) EMOTICONS.append(item) -# Twitter specific: +# Twitter & Reddit specific: HASHTAG = r"""(?:\#\w+)""" TWITTER_USER = r"""(?:@\w+)""" REDDIT_USER = r"(?:\/?u\/\w+)" +REDDIT_SUBREDDIT = r"(?:\/?r\/\w+)" 
#separately compiled regexps TWITTER_USER_RE = re.compile(TWITTER_USER, re.UNICODE) @@ -103,7 +104,7 @@ """ TWITTER_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, TWITTER_USER, HASHTAG, EMAILS, WORDS] -REDDIT_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, REDDIT_USER, HASHTAG, EMAILS, WORDS] +REDDIT_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, REDDIT_USER, REDDIT_SUBREDDIT, HASHTAG, EMAILS, WORDS] class TweetTokenizer():