erikavaris · newwaylw · Jul 3, 2017
diff --git a/README.md b/README.md
@@ -58,9 +58,9 @@ print(tokens)
 ```
 
 ## Reddit version
-
-A version of the tokenizer sensitive to Reddit usernames is also provided.
-Will work with either the `/u/name` or `u/name` format.
+
+A version of the tokenizer sensitive to Reddit usernames and subreddit names is also provided.
+It works with both the `/u/name` and `u/name` username formats, and with both the `/r/name` and `r/name` subreddit formats.
 
 ```
 from tokenizer import tokenizer

diff --git a/tokenizer/test_tokenizer.py b/tokenizer/test_tokenizer.py
@@ -1,10 +1,12 @@
 import unittest
 import tokenizer
 
+
 class TestTokenizerDefaults(unittest.TestCase):
 
     def setUp(self):
         self.T = tokenizer.TweetTokenizer()
+        self.redditT = tokenizer.RedditTokenizer()
 
     def test_emoticon(self):
         text = 'this is a tweet with kitty =^^= emoticon'
@@ -48,6 +50,18 @@ def test_handle(self):
         expected = ['a', 'tweet', 'at', '@some_handle', 'somewhere']
         self.assertEqual(actual, expected)
 
+    def test_reddit_user(self):
+        text = "reddit with user u/reddit-name mention"
+        actual = self.redditT.tokenize(text)
+        expected = ['reddit', 'with', 'user', 'u/reddit-name', 'mention']
+        self.assertEqual(actual, expected)
+
+    def test_reddit_subreddit(self):
+        text = "reddit with r/subreddit mention"
+        actual = self.redditT.tokenize(text)
+        expected = ['reddit', 'with', 'r/subreddit', 'mention']
+        self.assertEqual(actual, expected)
+
 class TestTokenizerRegularizations(unittest.TestCase):
 
     def test_hash_removal(self):

diff --git a/tokenizer/tokenizer.py b/tokenizer/tokenizer.py
@@ -63,10 +63,11 @@
         item = re.escape(item)
         EMOTICONS.append(item)
 
-# Twitter specific:
+# Twitter & Reddit specific:
 HASHTAG = r"""(?:\#\w+)"""
 TWITTER_USER = r"""(?:@\w+)"""
 REDDIT_USER = r"(?:\/?u\/\w+)"
+REDDIT_SUBREDDIT = r"(?:\/?r\/\w+)"
 
 #separately compiled regexps
 TWITTER_USER_RE = re.compile(TWITTER_USER, re.UNICODE)
@@ -103,7 +104,7 @@
     """
 TWITTER_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, TWITTER_USER, HASHTAG, EMAILS, WORDS]
 
-REDDIT_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, REDDIT_USER, HASHTAG, EMAILS, WORDS]
+REDDIT_REGEXPS = [URLS, PHONE] + EMOTICONS + [HTML_TAGS, ASCII_ARROWS, REDDIT_USER, REDDIT_SUBREDDIT, HASHTAG, EMAILS, WORDS]
 
 class TweetTokenizer():