-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
90 lines (76 loc) · 3.4 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import time
import json
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import numpy as np
import collections
import string
import re
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
def build_terms(line):
    """
    Preprocess the article text (title + body) removing stop words, stemming,
    transforming in lowercase and return the tokens of the text.
    Argument:
    line -- string (text) to be preprocessed
    Returns:
    line - a list of tokens corresponding to the input text after the preprocessing
    """
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))
    # Remove URLs first so their fragments do not survive tokenization.
    url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    line = url_pattern.sub(r'', line)
    # Strip emoji / pictograph code-point ranges from the text.
    emoji_pattern = re.compile(pattern="["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F700-\U0001F77F"  # alchemical symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               "]+", flags=re.UNICODE)
    line = emoji_pattern.sub(r'', line)
    line = line.lower()  # Transform in lowercase
    line = line.split()  # Tokenize the text to get a list of terms
    line = [term.strip(string.punctuation) for term in line]  # Remove punctuation signs, # included
    # BUG FIX: stripping punctuation can leave empty strings (tokens that were
    # pure punctuation, e.g. "-" or "..."); drop them together with stopwords,
    # since "" is never in stop_words and would otherwise reach the output.
    line = [word for word in line if word and word not in stop_words]
    line = [stemmer.stem(word) for word in line]  # Perform stemming
    return line
def read_tweets(file_path):
    """
    Read the raw tweets file and return one cleaned line per tweet.
    Argument:
    file_path -- path to a text file with one tweet (JSON string) per line
    Returns:
    lines - list of stripped lines with runs of spaces collapsed to one
    """
    with open(file_path, encoding="utf-8") as fp:
        lines = fp.readlines()
    # BUG FIX: the original used str.replace(' +', ' '), which only replaces
    # the *literal* two-character sequence "space plus". The intent was to
    # collapse runs of spaces, which requires a regex substitution.
    lines = [re.sub(r' +', ' ', l.strip()) for l in lines]
    print("There are ", len(lines), " tweets")
    return lines
def get_tweet_info(tweet):
    """
    Extract the display fields from a parsed tweet dict.
    Argument:
    tweet -- dict parsed from tweet JSON; expects keys 'id_str', 'created_at',
             'full_text', 'entities' (with 'hashtags'), 'favorite_count',
             'retweet_count'
    Returns:
    (fields, summary) - a dict with the selected fields and a human-readable
                        multi-line summary string of the same data
    """
    fields = {
        'Tweet': tweet['full_text'],
        'Date': tweet['created_at'],
        'Hashtags': [h['text'] for h in tweet['entities']['hashtags']],
        'Likes': tweet['favorite_count'],
        'Retweets': tweet['retweet_count'],
        # Placeholder handle: the username is not needed for the link to resolve.
        'Url': f"https://twitter.com/user_name/status/{tweet['id_str']}",
    }
    summary = (
        f"Date: {fields['Date']}\nText: {fields['Tweet']}\n"
        f"Hashtags: {fields['Hashtags']}\nLikes: {fields['Likes']}\n"
        f"Retweets: {fields['Retweets']}\nURL: {fields['Url']}\n{'-'*50}"
    )
    return (fields, summary)
def get_tweet(tweet_id, tweets):
    """
    Look up a tweet by its numeric id among raw JSON lines.
    Arguments:
    tweet_id -- value compared against the parsed tweet's 'id' field
    tweets -- iterable of JSON strings, one tweet per element
    Returns:
    The (fields, summary) pair from get_tweet_info for the first match,
    or None if no tweet has that id.
    """
    for raw in tweets:
        parsed = json.loads(raw)
        if parsed['id'] == tweet_id:
            return get_tweet_info(parsed)
    # Exhausted the list without a match.
    return None