-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathanalysis.py
196 lines (151 loc) · 7.89 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
########################################################################################################################
# Filename : analysis.py
# Author : Anastasiia Sedova
# Version : 1.0
# Date : Jan 21 2020
# Description : main file for analysis of user input
#
# Usage: analysis.py input_path
#
# Input
# input_path : path to .txt file where the person's answers are stored
#
#
# Output
# speaker_name : number of times person says his/her own name
# first_person_pronouns : number of personal pronouns
# pronouns : number of other pronouns
# absolute_counter : number of absolute words
# swear_counter : number of swear words
# afinn_score : afinn score
# sentiment : sentiment score
########################################################################################################################
from afinn import Afinn
from Sentiment_analysis.sent_predict import predict
# from pos_and_proper_names_counter import POSAndProperNames
import argparse
import nltk
from nltk import word_tokenize
import nltk.data
import re
from nltk.stem import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
def calculate_absolute_words(words):
    """
    Count the absolute words in a sequence of tokens.

    Word list source: https://kddidit.com/2015/04/20/grammar-absolute-words/

    :param words: iterable of tokens (strings) in text order
    :return: number of tokens that are absolute words, plus one for every
             adjacent ("no", "one") token pair
    """
    absolute = frozenset((
        "absolute", "absolutely", "astounding", "all", "always", "awful", "awfully", "complete", "completely",
        "dead", "deadly", "gigantic", "gigantically", "entire", "entirely", "fatally", "impossibly",
        "infinitely", "irrevocably", "giant", "eternal", "eternally", "every", "everybody",
        "everyone", "exhausted", "fantastic", "fatal", "final", "finite", "full", "fully", "impossible",
        "infinite", "irrevocable", "hideous", "hideously", "horrible", "horribly", "never", "nobody", "no one",
        "none", "overwhelmed", "perfect", "perfectly", "single", "supreme", "starving", "terrible", "terribly",
        "terrifying", "total", "totally", "ultimate", "ultimately", "unique", "uniquely", "wonderful",
        "wonderfully",
    ))
    # Materialize once so that one-shot iterables can be scanned twice.
    tokens = list(words)
    single_hits = sum(1 for token in tokens if token in absolute)
    # A word tokenizer yields "no" and "one" as two separate tokens, so the
    # two-word entry "no one" can never equal a single token; count the
    # adjacent pair explicitly instead.
    pair_hits = sum(1 for first, second in zip(tokens, tokens[1:])
                    if first == "no" and second == "one")
    return single_hits + pair_hits
# Shared English scorer, created on first use. Constructing a new Afinn
# object for every sentence repeats the same setup on each call.
_AFINN_EN = None


def calculate_afinn_score(sentence):
    """
    Return the AFINN score of a sentence, using the English word list.

    :param sentence: text to score
    :return: the value returned by ``Afinn.score`` for ``sentence``
    """
    global _AFINN_EN
    if _AFINN_EN is None:
        _AFINN_EN = Afinn(language='en')
    return _AFINN_EN.score(sentence)
def calculate_swear_words(words):
    """
    Count how many of the given tokens are swear words.

    Word list source: https://en.wiktionary.org/wiki/Category:English_swear_words

    :param words: iterable of tokens to check
    :return: number of tokens that appear in the swear-word list
    """
    swear_words = (
        "arse", "ass", "asshole", "bitch", "bastard", "bollocks", "bugger", "child-fucker", "crap", "cunt", "damn",
        "effing", "frigger", "fuck", "goddamn", "godsdamn", "hell", "horseshit", "motherfucker", "nigga", "nigger",
        "shit", "shitty", "shitass", "slut", "piss", "prick", "twat", "piss", "pissed",
    )
    hits = 0
    for token in words:
        if token in swear_words:
            hits += 1
    return hits
def word_counting(sentence, speaker_name):
    """
    Count speaker-name mentions, first-person pronouns and other pronouns.

    The sentence is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``; tokens are compared exactly as produced (no case
    folding is done here).

    :param sentence: sentence to analyse
    :param speaker_name: token that counts as the speaker's own name
    :return: tuple ``(speaker_name_count, first_person_pronouns,
             other_pronouns)``
    """
    # Every entry needs its own comma: adjacent string literals are joined
    # by the parser, so '"myself" "my"' silently became the single entry
    # "myselfmy" and neither "myself" nor "my" was ever recognised.
    first_person = ("i", "me", "myself", "my", "mine")
    # PRP = personal pronoun (I, he, she), PRP$ = possessive pronoun (my, his, hers)
    pronoun_tags = ("PRP", "PRP$")

    speaker_name_count = 0
    first_person_pronouns = 0
    other_pronouns = 0

    for word, tag in nltk.pos_tag(word_tokenize(sentence)):
        if word == speaker_name:
            speaker_name_count += 1
        if word in first_person:
            first_person_pronouns += 1
        elif tag in pronoun_tags:
            # Pronoun by tag that is not a first-person form.
            other_pronouns += 1
    return speaker_name_count, first_person_pronouns, other_pronouns
def print_results():
    """
    Print the pronoun and name statistics, one labelled line per value.

    NOTE(review): ``pronouns``, ``first_person_pronouns``, ``speaker_name``
    and ``other_names`` are looked up as module-level names, but no
    module-level assignment to any of them is visible in this file --
    confirm where they are set before calling this function.
    """
    # print() converts each value with str(), so with an empty separator the
    # output equals label + str(value).
    print("Other pronouns: ", pronouns, sep="")
    print("Pronounce with references to one ", first_person_pronouns, sep="")
    print("The proper name of the speaker: ", speaker_name, sep="")
    print("The other names speaker says: ", other_names, sep="")
def analyse(input_path):
    """
    Analyse a file of answers, print the results and return the scores.

    The first token in the file is taken as the speaker's name and removed
    from the word list; all relative values are divided by the number of
    remaining words and rounded to three decimals. If no words remain after
    the name, the relative values are 0.0.

    :param input_path: path to the text file with the person's answers
    :return: tuple ``(absolute_self_centrism, rel_self_centrism,
             rel_swear_counter, rel_absolute_counter, final_afinn_score,
             sentiment)``
    :raises IndexError: if the file contains no words at all
    """
    answers_sentences, answers_words = _read_answers(input_path)
    print(len(answers_words))

    if not answers_words:
        raise IndexError("no words found in " + repr(input_path)
                         + "; the first answer must be the speaker's name")

    # the first answer is always the proper name of the speaker
    speaker_name = answers_words[0]
    answers_words = answers_words[1:]
    total_words = len(answers_words)

    # Start at -1: the sentence answering "What's your name" is still part of
    # answers_sentences and would otherwise count as a self-mention.
    total_speaker_name = -1
    total_first_person_pronouns = 0
    total_other_pronouns = 0
    total_afinn_score = 0

    for sentence in answers_sentences:
        name_hits, first_person_hits, other_hits = word_counting(sentence, speaker_name)
        total_speaker_name += name_hits
        total_first_person_pronouns += first_person_hits
        total_other_pronouns += other_hits
        total_afinn_score += calculate_afinn_score(sentence)
    total_speaker_name = max(0, total_speaker_name)

    absolute_self_centrism = total_speaker_name + total_first_person_pronouns
    rel_self_centrism = _rounded_ratio(absolute_self_centrism, total_words)
    final_afinn_score = _rounded_ratio(total_afinn_score, len(answers_sentences))

    # number of absolute words
    absolute_absolute_counter = calculate_absolute_words(answers_words)
    rel_absolute_counter = _rounded_ratio(absolute_absolute_counter, total_words)

    # number of swear words
    absolute_swear_counter = calculate_swear_words(answers_words)
    rel_swear_counter = _rounded_ratio(absolute_swear_counter, total_words)

    # sentiment analysis on the whole input file
    sentiment = predict("Sentiment_analysis/trained_model.pt", input_path)

    print("Afinn score: " + str(final_afinn_score))
    print("A speaker says his/her own name: " + str(total_speaker_name))
    print("References to oneself:", total_first_person_pronouns, "of", total_words, "in total.")
    print("Use of other pronouns:", total_other_pronouns, "of", total_words, "in total.")
    print("Swear Words:", absolute_swear_counter, "of", total_words, "in total.")
    print("Absolute Words:", absolute_absolute_counter, "of", total_words, "in total.")
    print("Sentiment is: " + str(sentiment))

    return absolute_self_centrism, rel_self_centrism, rel_swear_counter, rel_absolute_counter, final_afinn_score, sentiment


def _read_answers(input_path):
    """Read the answers file; return (preprocessed sentences, all their tokens)."""
    answers_sentences = []  # sentences as preprocessed strings
    answers_words = []  # tokens of all sentences, in file order
    # The with-statement closes the file; no explicit close() is needed.
    with open(input_path) as file:
        for paragraph in file:
            for sentence in nltk.sent_tokenize(paragraph):
                clear_sentence = preprocessing(sentence)
                answers_sentences.append(clear_sentence)
                answers_words.extend(word_tokenize(clear_sentence))
    return answers_sentences, answers_words


def _rounded_ratio(count, total):
    """Return count / total rounded to 3 decimals, or 0.0 when total is 0."""
    if not total:
        return 0.0
    return round(count / total, 3)
# One-pass translation table: the apostrophe becomes a space; the characters
# ! , . - ( ) and newline are deleted. This replaces a regex written as a
# non-raw string with the invalid escape sequence "\-".
_PREPROCESS_TABLE = str.maketrans("'", " ", "!,.-()\n")


def preprocessing(sentence):
    """
    Normalize a sentence before tokenization.

    The text is lower-cased, the characters ``! , . - ( )`` and newlines are
    removed, and apostrophes are replaced by a space. Digits and all other
    characters are kept.

    :param sentence: raw sentence string
    :return: the normalized sentence
    """
    return sentence.lower().translate(_PREPROCESS_TABLE)
if __name__ == '__main__':
    # Command-line entry point: the positional argument is a file name that
    # is looked up under the relative "data/" directory.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "input_file",
        help="Text file containing person's answers on the questions.",
    )
    cli_args = cli.parse_args()
    answers_path = "data/" + cli_args.input_file
    analyse(answers_path)