pro_main.py
#This script reads the ISEAR CSV file
#and builds one word-frequency document per emotion
#(joy, fear, anger, sadness, disgust, shame, guilt)
#Tokenization and stemming are applied to each of them,
#followed by TF-IDF indexing
#A day's worth of tweets (assume 100) will then be stored, and each tweet will be scored
#on the basis of its sentiment
#Each sentiment's score for the day will be stored
'''
The choice of stop words is quite subjective.
We use the NLTK package's stop-word list to remove stop words from our documents; TF-IDF weighting alone is not a good enough filter here.
Minimum-term-frequency removal can actually prove counter-productive here.
We remove from the stop-word list some words that appear to contribute to the overall sentiment.
'''
'''
All these requirements lead us to use the ISEAR (International Survey on Emotion Antecedents and Reactions) dataset in our
experiments. ISEAR consists of 7,666 sentences and snippets in which 1,096 participants from psychology, the social
sciences, languages, fine arts, law, the natural sciences, engineering and medicine, across 16 countries on five continents,
completed a questionnaire about their experiences of and reactions to seven emotions in everyday life: joy, fear, anger,
sadness, disgust, shame and guilt.
'''
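#As parsed below, each row of isear.csv is assumed to collapse to a single pipe-delimited
#field, with the emotion label at index 36 and the free-text response at index 40
#(these indices are taken from the parsing code below, not from the ISEAR documentation).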
import nltk
from nltk.corpus import stopwords
import csv
from nltk.stem.snowball import SnowballStemmer
import pickle
import math
import numpy as np
import re
from tf_idf import *
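#The tf_idf module is not included in this file; from the calls below (computeTF,
#computeIDF, computeTFIDF) it is assumed to expose functions roughly like the
#commented-out sketch that follows. This is a sketch under those assumptions,
#not the actual module.
'''
def computeTF(wordDict):
    #Term frequency: each raw count divided by the total count in the document
    total = float(sum(wordDict.values()))
    return {term: count/total for term, count in wordDict.items()}

def computeIDF(doc_list):
    #Inverse document frequency: log(N / number of documents containing the term)
    N = len(doc_list)
    df = {}
    for doc in doc_list:
        for term, count in doc.items():
            if count > 0:
                df[term] = df.get(term, 0) + 1
    return {term: math.log(N/float(d)) for term, d in df.items()}

def computeTFIDF(tf, idf):
    #TF-IDF weight of each term in one document
    return {term: val*idf.get(term, 0.0) for term, val in tf.items()}
'''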
# This is a function to remove non-ASCII characters from the input
def remove_non_ascii(text):
    return ''.join(i for i in text if ord(i)<128)
# Preprocessing of a tweet
def processTweet(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)   #Replace URLs with the token URL
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)                         #Replace @mentions with AT_USER
    tweet = re.sub(r'[\s]+', ' ', tweet)                                 #Collapse runs of whitespace
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)                           #Strip the '#' from hashtags
    tweet = tweet.strip('\'"')                                           #Trim surrounding quotes
    return tweet
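#For illustration, processTweet('Check out https://example.com #nlp @user')
#returns 'check out URL nlp AT_USER'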
#Some words present in the original stop-word list may be beneficial to us, so retain them
remove_from_stop=['not','are','can','will','no','nor','very','again','with','about','against','between','through','during',
                  'before','after','above','below','further','all','few','most','more','out','have','has','had','having']
#Build the stop-word list actually used: drop contractions ending in "n't" and the retained words above
stop_list=[i for i in stopwords.words('english') if i[-3:]!="n't" and i not in remove_from_stop]
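#stop_list is built here but not applied anywhere below; presumably it is meant to
#filter tokens before stemming, along the lines of this hypothetical one-liner:
'''
tokens = [t for t in tokens if t not in stop_list]
'''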
#Dictionaries for each emotion
joy = {}
fear = {}
anger = {}
sadness = {}
disgust = {}
shame = {}
guilt = {}
#Initializing NLTK's SnowballStemmer object
stemmer = SnowballStemmer("english")
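#e.g. stemmer.stem('feeling') -> 'feel'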
#Reading the ISEAR dataset
reader=csv.reader(open('isear.csv','r'))
count=0
#Special characters that need to be stripped off
special_characters=['[',']','\\','/',',','"','@','#','.']
distinct_words=[] #A list of distinct words over the corpora
for row in reader:
    l=row[0].split('|')                 #Split the row on the pipe delimiter
    emotion=l[36]                       #Emotion label for the text
    text=l[40]                          #Text of the document
    text=remove_non_ascii(text)         #Remove non-ASCII characters
    for i in special_characters:        #Replace special characters with blanks
        text=text.replace(i,"")
    tokens=text.split(" ")
    for i in tokens:                    #Stem the tokens
        stem=stemmer.stem(i)
        if stem not in distinct_words:  #First time this stem is seen: register it in every emotion dictionary
            distinct_words.append(stem)
            joy[stem]=0
            fear[stem]=0
            anger[stem]=0
            sadness[stem]=0
            disgust[stem]=0
            shame[stem]=0
            guilt[stem]=0
        #print(text)
        if(emotion=='joy'):             #Frequency update of the stemmed word in the matching emotion document
            joy[stem]+=1
        elif(emotion=='fear'):
            fear[stem]+=1
        elif(emotion=='anger'):
            anger[stem]+=1
        elif(emotion=='sadness'):
            sadness[stem]+=1
        elif(emotion=='disgust'):
            disgust[stem]+=1
        elif(emotion=='shame'):
            shame[stem]+=1
        elif(emotion=='guilt'):
            guilt[stem]+=1
print("Dictionaries prepared")
print(len(distinct_words))
#Unused fragment: a plain term-frequency computation (count / total count) over a
#word-count dictionary. It references a wordDict that is never defined in this script
#and duplicates computeTF from tf_idf, so it is kept commented out.
'''
tfDict = {}
wordcount = 0
for key, val in wordDict.items():
    wordcount += val
for key, val in wordDict.items():
    tfDict[key] = val/float(wordcount)
'''
'''
with open('joy.pkl','wb') as f:
    pickle.dump(joy,f)
with open('fear.pkl','wb') as f:
    pickle.dump(fear,f)
with open('anger.pkl','wb') as f:
    pickle.dump(anger,f)
with open('sadness.pkl','wb') as f:
    pickle.dump(sadness,f)
with open('disgust.pkl','wb') as f:
    pickle.dump(disgust,f)
with open('shame.pkl','wb') as f:
    pickle.dump(shame,f)
with open('guilt.pkl','wb') as f:
    pickle.dump(guilt,f)
with open('distinct.pkl','wb') as f:
    pickle.dump(distinct_words,f)
'''
doc_list=[joy,fear,anger,sadness,disgust,shame,guilt]
print("Prepared Document List")
TF_IDF_VECTOR=[]
IDF=computeIDF(doc_list) #Calculate the IDF from the imported module
print("IDF calculated")
#2-D array initialisation
for word_count in range(len(distinct_words)):
    temp=[0 for i in range(7)]
    TF_IDF_VECTOR.append(temp)
print("2-D Array initialized")
#TF-IDF matrix being filled
for doc_iter in range(len(doc_list)):
    temp_dict=computeTFIDF(computeTF(doc_list[doc_iter]),IDF)
    for word_count in range(len(distinct_words)):
        TF_IDF_VECTOR[word_count][doc_iter]=temp_dict[distinct_words[word_count]]
    print(doc_iter)
with open('tf_idf.pkl','wb') as f:
    pickle.dump(TF_IDF_VECTOR,f)
#Testing snippet
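#A minimal sketch of the scoring step described at the top of this file: preprocess a
#tweet, stem its tokens, and sum each emotion's TF-IDF weights over the stems it
#contains. The tweet text is hypothetical and the emotion order follows doc_list above.
'''
emotions = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']
sample = processTweet("So proud of my team's results today! #happy")
scores = [0.0]*7
for token in sample.split(' '):
    stem = stemmer.stem(token)
    if stem in distinct_words:
        idx = distinct_words.index(stem)
        for e in range(7):
            scores[e] += TF_IDF_VECTOR[idx][e]
print(dict(zip(emotions, scores)))
'''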