linear_regression.py
# -*- coding: utf-8 -*-
"""Linear Regression.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VNE302FFh6iDq1P4Xfm_3kXixso0Scr7
"""
#import libraries
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
import json
import pandas as pd
# Load the reviews dataset (JSON) from a local path
with open("C:/Users/Steven/Downloads/reviews.json", encoding="utf-8") as f:
    data = json.load(f)
print(data)
data["paper"][1]
data["paper"][0]["review"][0]["evaluation"]
# Collect (evaluation, text) pairs for the Spanish-language reviews
reviews = []
for paper in data["paper"]:
    for review in paper["review"]:
        if review["lan"] == "es":
            reviews.append([review["evaluation"], review["text"]])
len(reviews)
df = pd.DataFrame(reviews, columns=["Evaluation", "Text"])
df["Evaluation"].unique()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df["Text"], df["Evaluation"], test_size=0.20, random_state=42)
y_test
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
spanish_stopwords = stopwords.words("spanish")
#Vectorization: Bag of words
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Create vectorizer instances (the TF-IDF vectorizer is the one used below)
vectorizer = CountVectorizer(stop_words=spanish_stopwords)
tf_vectorizer = TfidfVectorizer(stop_words=spanish_stopwords)
x_train_features = tf_vectorizer.fit_transform(X_train)
x_test_features = tf_vectorizer.transform(X_test)
x_train_features.toarray()
x_test_features.shape
# Vocabulary
vocab = tf_vectorizer.vocabulary_
# Tokens (feature names); get_feature_names() was removed in recent scikit-learn, so use get_feature_names_out()
token = tf_vectorizer.get_feature_names_out()
len(token)
# Create a dataframe view of the training features
cv_dataframe = pd.DataFrame(x_train_features.toarray(), columns=token)
#print(cv_dataframe)
# Create linear regression object
regr = linear_model.LinearRegression()
np.shape(x_train_features)
# Train the model using the training sets
regr.fit(x_train_features, y_train)
#coefficient of determination (𝑅²)
regr.score(x_train_features, y_train)
# Make predictions using the testing set
y_pred = regr.predict(x_test_features)
# The mean squared error
print(mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print(r2_score(y_test, y_pred))
# Plot outputs: X_test holds raw text, so plot predicted vs. actual evaluation scores instead
plt.scatter(y_test, y_pred, color='black')
plt.xlabel("actual evaluation")
plt.ylabel("predicted evaluation")
plt.show()
np.size(x_test_features)
text1= ["Este articulo presenta nuevos metodos para la creación y manejo de un software que ayude a prevenir errores en los softwares. La bibliografia muestra un gran nivel de comprención y además expresa con lenguaje adecuado su implementación. Se ha hecho una encuesta que evalua la efectividad al usar la maquinaria. La originalidad del trabajo es increible, presenta nuevas tecnicas que serán de gran ayuda al público",
"El documento muestra el desarrollo alternativo de sistemas informáticos. Los autores generan nuevas variaciones de un algorithmo que usa formular para mejorar el desarollo de la maquinaria. Es bastante útil y se entiende perfectamente, Sin embargo la notación de las formulas puede ser mejorada y las gráficas serían mas representativas si se usa tablas. La calidad de estas gráficas tampoco es adecuada, deberían ser más grandes y en otro formato.",
"El uso de tecnologías como LaTex es habitual en el mundo de la academia. Los autores muestran y describen la implementación. Sin embargo, algunos puntos son poco objetivos y se abusa el uso de adjetivos calificativos que generan mejor impacto del que deberia tener. Se deben cuidar aspectos del lenguaje. El valor que representa mejor el texto es el uso del lenguaje ya manejado y demuestra total dominio del mismo."]
vector = tf_vectorizer.transform(text1)
new_pred = regr.predict(vector)
new_pred
# The prediction works like a score: it reflects a positive or negative correlation
# between the features and the labels. This is not a categorical problem, so the output
# is not an exact label but a value indicating how close the text is to each label.
# The first prediction leans towards label 1, the second is also positive, and the last
# one is closer to minus one because that example reads like a negative review.
# We are interested in how close each prediction is to one of the labels.
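# A minimal sketch (not in the original notebook): one way to read the continuous
# predictions is to snap each one to the nearest evaluation label observed in the data.
# The names `possible_labels` and `nearest_labels` are illustrative.
possible_labels = sorted(df["Evaluation"].unique())
nearest_labels = [min(possible_labels, key=lambda lab: abs(float(lab) - p)) for p in new_pred]
print(nearest_labels)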
# Assignment 2:
import numpy as np
print(sorted(regr.coef_)[:10])
print(sorted(regr.coef_)[-10:])
coeftofeature = [[regr.coef_[i], tok] for i, tok in enumerate(token)]
# Ten features with the minimum (most negative) coefficients
sorted(coeftofeature)[:10]
# Ten features with the maximum (most positive) coefficients
sorted(coeftofeature)[-10:]
## With 10-fold cross-validation
import numpy as np
from sklearn import linear_model, datasets
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
X_train = df["Text"]
y_train = df["Evaluation"]
X_train_feat = tf_vectorizer.fit_transform(X_train)
kf = KFold(n_splits=10, shuffle=False)
r2 = 0
best_r2 = 0
best_model = linear_model.LinearRegression()
for train, test in kf.split(X_train_feat, y_train):
    regr = linear_model.LinearRegression()
    regr.fit(X_train_feat[train], y_train.iloc[train])
    pred = regr.predict(X_train_feat[test])
    # r2_score expects the true values first, then the predictions
    fold_r2 = r2_score(y_train.iloc[test], pred)
    r2 += fold_r2
    print(fold_r2)
    # Keep the model from the best-scoring fold
    if fold_r2 > best_r2:
        best_r2 = fold_r2
        best_model = regr
print('overall r2 score: ', r2 / 10)
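# A minimal sanity-check sketch (not in the original notebook) using scikit-learn's
# built-in cross_val_score, which should roughly reproduce the manual 10-fold loop above.
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(linear_model.LinearRegression(), X_train_feat, y_train, cv=10, scoring="r2")
print("mean r2 across folds:", cv_scores.mean())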