linear_regression.py
# -*- coding: utf-8 -*-
"""Linear Regression.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VNE302FFh6iDq1P4Xfm_3kXixso0Scr7
"""
#import libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
import json
import pandas as pd
# Load the reviews dataset (JSON) from a local path
with open("C:/Users/Steven/Downloads/reviews.json", encoding="utf-8") as f:
    data = json.load(f)
print(data)
data["paper"][1]
data["paper"][0]["review"][0]["evaluation"]
# Collect (evaluation, text) pairs for the Spanish-language reviews
reviews = []
for paper in data["paper"]:
    for review in paper["review"]:
        if review["lan"] == "es":
            reviews.append([review["evaluation"], review["text"]])
len(reviews)
df = pd.DataFrame(reviews, columns=["Evaluation", "Text"])
df["Evaluation"].unique()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Evaluation"], test_size=0.20, random_state=42)
y_test
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
spanish_stopwords = stopwords.words("spanish")
#Vectorization: Bag of words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Create vectorizer instances (the TF-IDF vectorizer is the one used below)
vectorizer = CountVectorizer(stop_words=spanish_stopwords)
tf_vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)
x_train_features = tf_vectorizer.fit_transform(X_train)
x_test_features = tf_vectorizer.transform(X_test)
x_train_features.toarray()
x_test_features.shape
# Vocabulary
vocab = tf_vectorizer.vocabulary_
# Tokens (feature names); get_feature_names() was removed in recent scikit-learn, so use get_feature_names_out()
token = tf_vectorizer.get_feature_names_out()
len(token)
# Create a dataframe view of the training features
cv_dataframe = pd.DataFrame(x_train_features.toarray(), columns=token)
#print(cv_dataframe)
# Create linear regression object
regr = linear_model.LinearRegression()
np.shape(x_train_features)
# Train the model using the training sets
regr.fit(x_train_features, y_train)
#coefficient of determination (𝑅²)
regr.score(x_train_features, y_train)
# Make predictions using the testing set
y_pred = regr.predict(x_test_features)
# The mean squared error
print(mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print(r2_score(y_test, y_pred))
# Plot outputs: X_test holds raw text, so plot predicted vs. actual evaluation scores instead
plt.scatter(y_test, y_pred, color='black')
plt.xlabel("actual evaluation")
plt.ylabel("predicted evaluation")
plt.show()
np.size(x_test_features)
text1= ["Este articulo presenta nuevos metodos para la creación y manejo de un software que ayude a prevenir errores en los softwares. La bibliografia muestra un gran nivel de comprención y además expresa con lenguaje adecuado su implementación. Se ha hecho una encuesta que evalua la efectividad al usar la maquinaria. La originalidad del trabajo es increible, presenta nuevas tecnicas que serán de gran ayuda al público",
"El documento muestra el desarrollo alternativo de sistemas informáticos. Los autores generan nuevas variaciones de un algorithmo que usa formular para mejorar el desarollo de la maquinaria. Es bastante útil y se entiende perfectamente, Sin embargo la notación de las formulas puede ser mejorada y las gráficas serían mas representativas si se usa tablas. La calidad de estas gráficas tampoco es adecuada, deberían ser más grandes y en otro formato.",
"El uso de tecnologías como LaTex es habitual en el mundo de la academia. Los autores muestran y describen la implementación. Sin embargo, algunos puntos son poco objetivos y se abusa el uso de adjetivos calificativos que generan mejor impacto del que deberia tener. Se deben cuidar aspectos del lenguaje. El valor que representa mejor el texto es el uso del lenguaje ya manejado y demuestra total dominio del mismo."]
vector = tf_vectorizer.transform(text1)
new_pred = regr.predict(vector)
new_pred
# The prediction works like a score: it reflects a positive or negative correlation
# between the features and the labels. This is not a categorical problem, so the output
# is not an exact label but a value indicating how close the text is to each label.
# The first prediction leans towards label 1, the second is also positive, and the last
# one is closer to minus one because that example reads like a negative review.
# We are interested in how close each prediction is to one of the labels.
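# A minimal sketch (not in the original notebook): one way to read the continuous
# predictions is to snap each one to the nearest evaluation label observed in the data.
# The names `possible_labels` and `nearest_labels` are illustrative.
possible_labels = sorted(df["Evaluation"].unique())
nearest_labels = [min(possible_labels, key=lambda lab: abs(float(lab) - p)) for p in new_pred]
print(nearest_labels)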
# Assignment 2:
import numpy as np
print(sorted(regr.coef_)[:10])
print(sorted(regr.coef_)[-10:])
coeftofeature = [[regr.coef_[i], tok] for i, tok in enumerate(token)]
# Ten features with the minimum (most negative) coefficients
sorted(coeftofeature)[:10]
# Ten features with the maximum (most positive) coefficients
sorted(coeftofeature)[-10:]
## With 10-fold cross-validation
import numpy as np
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
X_train = df["Text"]
y_train = df["Evaluation"]
X_train_feat = tf_vectorizer.fit_transform(X_train)
kf = KFold(n_splits=10, shuffle=False)
r2 = 0
best_r2 = 0
best_model = linear_model.LinearRegression()
for train, test in kf.split(X_train_feat, y_train):
    regr = linear_model.LinearRegression()
    regr.fit(X_train_feat[train], y_train.iloc[train])
    pred = regr.predict(X_train_feat[test])
    # r2_score expects the true values first, then the predictions
    fold_r2 = r2_score(y_train.iloc[test], pred)
    r2 += fold_r2
    print(fold_r2)
    # Keep the model from the best-scoring fold
    if fold_r2 > best_r2:
        best_r2 = fold_r2
        best_model = regr
print('overall r2 score: ', r2 / 10)
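# A minimal sanity-check sketch (not in the original notebook) using scikit-learn's
# built-in cross_val_score, which should roughly reproduce the manual 10-fold loop above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(linear_model.LinearRegression(), X_train_feat, y_train, cv=10, scoring="r2")
print("mean r2 across folds:", cv_scores.mean())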