WikiBayesianProfilingDemo.py
import sys
import pickle

import numpy as np
from scipy.sparse import csc_matrix
from sklearn.feature_extraction.text import CountVectorizer

print(sys.version)
# Words previously profiled in this demo: 'awful', 'frightening', 'comedy',
# 'romance', 'brilliant', 'scary'.
target_word = 'king'
min_frequency = 5      # min_df: a word must occur in at least this many sentences
prior = 250            # pseudo-count prior that shrinks scores of rare words toward zero
threshold = 1.0        # log-ratio scores below this are zeroed out
profile_words = 10000  # profile dimension: scores are kept only for the most frequent words
NUM_WORDS = 25000      # vocabulary cap for the vectorizer (max_features)
# The corpus: one wiki sentence per line.
f = open("/Users/oleg/Dropbox/Research/Datasets/wiki-english-20171001")
sentences = f.read().split("\n")
f.close()
# Binary bag-of-words over sentences: X[s, w] = 1 iff word w occurs in sentence s.
vectorizer_X = CountVectorizer(min_df=min_frequency, max_features=NUM_WORDS, binary=True)
X = vectorizer_X.fit_transform(sentences)
# Cache the fitted vectorizer and the matrix so later runs can skip the fit.
f_vectorizer_X = open("wiki_vectorizer_X.pickle", "wb")
pickle.dump(vectorizer_X, f_vectorizer_X, protocol=4)
f_vectorizer_X.close()
f_X = open("wiki_X.pickle", "wb")
pickle.dump(X, f_X, protocol=4)
f_X.close()
#print("Loading Vectorizer")
#f_vectorizer_X = open("wiki_vectorizer_X.pickle", "rb")
#vectorizer_X = pickle.load(f_vectorizer_X)
#f_vectorizer_X.close()
#print("Loading Data")
#f_X = open("wiki_X.pickle", "rb")
#X = pickle.load(f_X)
#f_X.close()
X_csc = csc_matrix(X)  # column-oriented copy for fast per-word column slicing
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = feature_names.shape[0]
target_id = vectorizer_X.vocabulary_[target_word]  # kept for reference / sanity checking
# Background statistics: in how many sentences each word occurs, and the
# corresponding background probability of each word.
word_count = np.array(X.sum(axis=0)).reshape(-1)
word_p = word_count / word_count.sum()
print(word_count.shape)  # sanity check: one count per vocabulary word
# Word ids ordered from most to least frequent; profiles keep the top profile_words.
sorted_ids = np.argsort(-1 * word_count)
f = open("Wiki_Bayesian_profile_%d_%d_%.2f_%d.vec" % (NUM_WORDS, prior, threshold, profile_words), "w+")
f.write("%d %d\n" % (X.shape[1], profile_words))
for i in range(number_of_features):
    if i % 100 == 0:
        print(i)  # progress indicator
    # Boolean mask over sentences: which sentences contain word i?
    in_sentences = (X_csc.getcol(i) == 1).toarray()[:, 0]
    # Empirical distribution of all words over those sentences.
    word_score = np.array(X[in_sentences].sum(axis=0)).reshape(-1)
    word_score = word_score / word_score.sum()
    # Pointwise log-ratio against the background distribution; words that never
    # co-occur with word i give log(0) = -inf, zeroed by the threshold below.
    with np.errstate(divide='ignore'):
        word_score = np.log(word_score / word_p)
    # Shrink scores of rare words toward zero via the pseudo-count prior.
    word_score *= word_count / (prior + word_count)
    word_score = np.where(word_score >= threshold, word_score, 0)
    # Keep only the dimensions for the profile_words most frequent words.
    word_score = word_score[sorted_ids[:profile_words]]
    f.write(feature_names[i] + " ")
    np.savetxt(f, word_score.reshape((1, -1)), delimiter=' ', fmt='%.2f')
f.close()
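
# A minimal follow-up sketch (not part of the original pipeline): read the
# target word's profile back from the .vec file just written, relying only on
# the word2vec-style text layout used above (header, then "word v1 ... vn").
profile_path = ("Wiki_Bayesian_profile_%d_%d_%.2f_%d.vec"
                % (NUM_WORDS, prior, threshold, profile_words))
with open(profile_path) as f_in:
    f_in.readline()  # skip the "vocab_size dimension" header
    for line in f_in:
        word, values = line.split(" ", 1)
        if word == target_word:
            profile = np.array(values.split(), dtype=float)
            print(word, "non-zero profile entries:", int((profile != 0).sum()))
            break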