sentimental_analysis_reviews.py
"""Train a Naive Bayes sentiment classifier on review JSON lines (one object
per line with 'text' and 'stars' fields), pickle the model, and write every
review back out with a predicted sentiment label."""
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import TrigramAssocMeasures
from nltk import pos_tag
import random
import re
import json
import pickle
def word_feats(words):
    """Build a bag-of-words feature dict, augmented with the top bigram
    collocations found in the token list."""
    features = {word: True for word in words}
    finder = BigramCollocationFinder.from_words(words)
    # Drop bigrams containing tokens with no word character (pure punctuation).
    finder.apply_word_filter(lambda w: not re.match(r'\w', w))
    bigrams = finder.nbest(BigramAssocMeasures.chi_sq, 20000)
    features.update({bigram: True for bigram in bigrams})
    # Trigram features were tried as well but are disabled:
    # finder = TrigramCollocationFinder.from_words(words)
    # finder.apply_word_filter(lambda w: not re.match(r'\w', w))
    # trigrams = finder.nbest(TrigramAssocMeasures.chi_sq, 20000)
    # features.update({trigram: True for trigram in trigrams})
    return features
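
# Illustrative example (not in the original script): for a two-token review
# the feature dict contains each unigram plus the surviving bigram tuple:
#   word_feats(['good', 'food'])
#   -> {'good': True, 'food': True, ('good', 'food'): True}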

#f = open('review_small.json', 'r')
f = open('review_50000.json', 'r')
pos_data = []
neg_data = []

def get_adjectives(words):
    """Return the adjective ('JJ') tokens in `words` (currently unused)."""
    pos_words = pos_tag(words)
    return [w for w in pos_words if w[1] == 'JJ']

# Cache the stopword list as a set so tokenize() doesn't rebuild it per token.
STOPWORDS = set(stopwords.words('english'))

def tokenize(sentence):
    """Lowercase, tokenize, and drop English stopwords."""
    tokens = word_tokenize(sentence.lower())
    return [w for w in tokens if w not in STOPWORDS]  # and re.match(r'\w', w)

# First pass: featurize every review and bucket it by star rating.
read = 0
for line in f:
    read += 1
    if read % 1000 == 0:
        print(read)
    review = json.loads(line)  # each line is one JSON object; safer than eval()
    features = word_feats(tokenize(review['text']))
    # Reviews at 3.5 stars or above count as positive, the rest as negative.
    if review['stars'] >= 3.5:
        pos_data.append((features, 'pos'))
    else:
        neg_data.append((features, 'neg'))
    # Earlier noun-extraction experiment, disabled:
    # tokens = nltk.word_tokenize(review['text'])
    # tagged = nltk.pos_tag(tokens)
    # entities = nltk.chunk.ne_chunk(tagged)
    # leaves = entities.leaves()
    # nouns = [t[0] for t in leaves if t[1][:2] == "NN"]
f.close()

# Balance the classes by downsampling the larger one.
pos_size = len(pos_data)
neg_size = len(neg_data)
min_size = min(pos_size, neg_size)
if pos_size > min_size:
    pos_data = random.sample(pos_data, min_size)
elif neg_size > min_size:
    neg_data = random.sample(neg_data, min_size)

# 3/4 cut-offs, presumably for a train/test split; not used below, so the
# classifier is trained on all of the balanced data.
pos_limit = len(pos_data) * 3 // 4
neg_limit = len(neg_data) * 3 // 4

train_data = neg_data + pos_data
classifier = NaiveBayesClassifier.train(train_data)

# Persist the trained model for reuse.
f = open('my_classifier.pickle', 'wb')
pickle.dump(classifier, f)
f.close()

# Second pass: label every review with its predicted sentiment.
f_out = open('review_sentiment_50000.json', 'w')
f = open('review_50000.json', 'r')
for line in f:
    review = json.loads(line)
    output_class = classifier.classify(word_feats(tokenize(review['text'])))
    review['predicted_sentiment'] = output_class
    f_out.write(json.dumps(review) + "\n")
f.close()
f_out.close()

classifier.show_most_informative_features()
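
# --- Usage sketch (not part of the original script) ---
# A minimal example of how the pickled classifier could be reloaded later and
# applied to a new review; the filename and the word_feats/tokenize helpers
# are the ones defined above.
#
#   import pickle
#   with open('my_classifier.pickle', 'rb') as fh:
#       clf = pickle.load(fh)
#   text = "The food was great and the staff were friendly"
#   print(clf.classify(word_feats(tokenize(text))))  # -> 'pos' or 'neg'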