# YahooTextCategorizationDemoBatch.py
#
# Trains a MultiClassTsetlinMachine on binary bag-of-words features built from
# the CSV files under /data/yahoo_answers_csv/, feeding the training set to the
# model in batches and printing test accuracy and timings.
import numpy as np
import keras
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from keras.datasets import imdb
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import csv
from scipy.sparse import csr_matrix, csc_matrix, lil_matrix
from PySparseCoalescedTsetlinMachineCUDA.tm import MultiClassTsetlinMachine
# Run configuration.
#
# `clauses`, `T` and `s` are passed, in that order, as the first three
# positional arguments of MultiClassTsetlinMachine further down.
# NOTE(review): presumably T is the vote threshold and s the specificity
# parameter of the Tsetlin Machine -- confirm against the library docs.
clauses = 10_000
T = 10_000
s = 1.0

# Number of consecutive slices the training set is divided into per epoch.
batches = 100
def _read_labeled_csv(path):
    """Read a labelled CSV file into parallel document and label lists.

    The first column of every row is parsed as the integer class label; all
    remaining columns are joined with single spaces into one document string.

    Args:
        path: Location of the CSV file (comma-delimited, double-quote quoted).

    Returns:
        A ``(documents, labels)`` tuple of two lists of equal length.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a row's first column is not an integer.
    """
    documents = []
    labels = []
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; the `with` block closes the file even when a
    # row fails to parse.
    with open(path, "r", newline="") as handle:
        for row in csv.reader(handle, delimiter=",", quotechar='"'):
            if not row:
                # csv.reader yields [] for a blank line; there is no label
                # column to read, so skip it instead of raising IndexError.
                continue
            documents.append(" ".join(row[1:]))
            labels.append(int(row[0]))
    return documents, labels


print("READ")

training_documents, training_y = _read_labeled_csv("/data/yahoo_answers_csv/train.csv")
testing_documents, testing_y = _read_labeled_csv("/data/yahoo_answers_csv/test.csv")

print(len(training_documents))
# Bag-of-words encoder: binary=True records term presence rather than counts,
# and max_features caps the vocabulary at 10000 terms.
vectorizer_X = CountVectorizer(binary=True, max_features=10000)

print("VECTORIZE")

# The vocabulary is fitted on the training documents only; the test documents
# are then encoded with that same vocabulary.
X_train = vectorizer_X.fit_transform(training_documents)
Y_train = np.array(training_y)

# Build the feature-name array once and derive the count from it instead of
# asking the vectorizer to rebuild the array a second time.
feature_names = vectorizer_X.get_feature_names_out()
number_of_features = feature_names.shape[0]

X_test = vectorizer_X.transform(testing_documents)
Y_test = np.array(testing_y)

print("DONE")
epochs = 100

# Integer division: if the number of training rows is not a multiple of
# `batches`, the trailing remainder rows are never passed to fit().
batch_size_train = Y_train.shape[0] // batches

tm = MultiClassTsetlinMachine(clauses, T, s, max_included_literals=32)

# NOTE(review): the indentation of this loop was lost in the copy this file
# was recovered from. The evaluation and the print are assumed to run after
# every batch (so the reported training time is that batch's time and the
# "#<n>" prefix is the epoch number) -- confirm against the upstream script.
for i in range(epochs):
    for batch in range(batches):
        # Row range of the current batch, computed once and shared by the
        # feature and label slices.
        batch_start = batch * batch_size_train
        batch_stop = batch_start + batch_size_train

        # The slicing stays inside the timed region, as in the original call.
        start_training = time()
        tm.fit(
            X_train[batch_start:batch_stop],
            Y_train[batch_start:batch_stop],
            epochs=1,
            incremental=True,  # presumably continues from the current model state -- verify
        )
        stop_training = time()

        # Accuracy on the full test set, as a percentage.
        start_testing = time()
        result_test = 100*(tm.predict(X_test) == Y_test).mean()
        stop_testing = time()

        print("#%d Accuracy Test: %.2f%% Training: %.2fs Testing: %.2fs" % (i+1, result_test, stop_training-start_training, stop_testing-start_testing))