-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
157 lines (127 loc) · 4.61 KB
/
preprocessing.py
File metadata and controls
157 lines (127 loc) · 4.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# CHANTRE Honorine CHAH2807
# THOMAS Eliott THOE2303
import pandas as pd
import preprocessor as p
import spacy
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
def preprocess_tweet(row):
    """
    Clean the raw text of a single tweet.
    :param row: a dataframe row (or any mapping) holding the raw tweet under the 'OriginalTweet' key
    :return: the cleaned tweet as one string, words separated by single spaces
    """
    # Punctuation and mis-decoded characters that are removed outright
    unwanted_chars = (
        '-', '.', 'Ã', '±', 'ã',
        '¼', 'â', '»', '«', '§',
        '$', "'", '(', ')', '+',
        ',', '=', '^', '`', '|', '~',
    )
    raw_tweet = row['OriginalTweet']
    # The filtered text is prefixed with a single space before being handed to the cleaner
    stripped = ' ' + ''.join(char for char in raw_tweet if char not in unwanted_chars)
    # Cleaning pass of the `preprocessor` module; per the original author's note it removes
    # URLs, hashtags, mentions, reserved words (RT, FAV), emojis and smileys
    cleaned = p.clean(stripped)
    # Drop a couple of very common words, then glue the remaining ones back together
    kept_words = [token for token in cleaned.split(' ') if token not in ('and', 'are')]
    return ' '.join(kept_words)
def give_number_to_class(row, original_class):
    """
    Translate the textual sentiment of a row into a numeric class label.
    :param row: a dataframe row (or any mapping) holding the sentiment label under the 'Sentiment' key
    :param original_class: truthy to keep the 5 original classes, falsy to merge them into 3 classes
    :return: -2, -1, 0, 1 or 2 with 5 classes; -1, 0 or 1 with 3 classes
    """
    # Any label that is not listed here falls through to the most positive class (2)
    five_class_label = {
        'Extremely Negative': -2,
        'Negative': -1,
        'Neutral': 0,
        'Positive': 1,
    }.get(row['Sentiment'], 2)
    if original_class:
        return five_class_label
    # With 3 classes the two extremes are merged into their neighbours: clamp to [-1, 1]
    return max(-1, min(1, five_class_label))
def lemmatisation_spacy(text, nlp):
    """
    Lemmatise a tweet with spacy.
    :param text: a string containing the tweet to lemmatise
    :param nlp: a spacy pipeline; it is called on the text and must yield tokens exposing `lemma_`
    :return: the lemmas of every token, joined by single spaces
    """
    return " ".join(token.lemma_ for token in nlp(text))
def lemmatisation_nltk(text):
    """
    Lemmatise a tweet with nltk.
    :param text: a string containing the tweet to lemmatise
    :return: the lemma of every whitespace-separated word, joined by single spaces
    """
    wordnet = WordNetLemmatizer()
    lemmas = (wordnet.lemmatize(word) for word in text.split())
    return " ".join(lemmas)
def prepare_dataframe(file_name, original_class, lemmatising=None):
    """
    Load the tweet CSV and build the model inputs (cleaned text) and targets (class numbers).
    :param file_name: path of the CSV file; it must provide the 'Location', 'OriginalTweet' and 'Sentiment' columns
    :param original_class: True to keep the 5 original classes, False to merge them into 3 classes
    :param lemmatising: 'spacy' or 'nltk' to lemmatise the tweets with that library;
                        any other value (default None) skips lemmatisation
    :return X_df: a series with the preprocessed tweet texts
    :return y_df: a series with the class number of each kept tweet
    """
    # We save data of the csv file in a dataframe
    data_df = pd.read_csv(file_name, sep=',', encoding='latin')
    # We drop the column Location, then the rows with missing values
    data_df = data_df.drop(columns=['Location']).dropna()
    # We drop the duplicates. drop_duplicates() returns a new dataframe, so the result
    # has to be assigned back, otherwise the duplicated rows are silently kept
    data_df = data_df.drop_duplicates()
    # We apply the preprocess_tweet function to the dataframe
    data_df['OriginalTweet'] = data_df.apply(preprocess_tweet, axis=1)
    # We compute the number of whitespace-separated words of each cleaned tweet
    data_df['text_len'] = [len(tweet.split()) for tweet in data_df['OriginalTweet']]
    # We apply the give_number_to_class function to the dataframe
    data_df['Sentiment_Number'] = data_df.apply(
        lambda tweet_row: give_number_to_class(tweet_row, original_class), axis=1)
    # We only keep the tweets with more than 4 words; drop=True renumbers the rows
    # without storing the old index as an extra column
    data_df = data_df[data_df['text_len'] > 4].reset_index(drop=True)
    X_df = data_df['OriginalTweet']
    y_df = data_df['Sentiment_Number']
    # Lemmatisation: only 'spacy' and 'nltk' are recognised, anything else leaves the text as is
    if lemmatising == 'spacy':
        nlp = spacy.load('en_core_web_sm')
        # tqdm.pandas() registers progress_apply so the pass shows a progress bar
        tqdm.pandas()
        X_df = X_df.progress_apply(lemmatisation_spacy, args=(nlp,))
    elif lemmatising == 'nltk':
        tqdm.pandas()
        X_df = X_df.progress_apply(lemmatisation_nltk)
    return X_df, y_df