-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadult_data.py
214 lines (175 loc) · 10.4 KB
/
adult_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# AUTHOR: GEORGE BRADLEY
# LAST EDIT: 18/02/2021
# TITLE: CW_PART_1.PY
import numpy as np
import pandas as pd
from prettytable import PrettyTable
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn import preprocessing
# The read_data() function reads in csv and outputs a dataframe.
# Additionally a df with a specific column dropped can also be created.
def read_data(data_path, data_cols, excluded_columns = ''):
    """Read selected columns of a CSV file into a DataFrame.

    Args:
        data_path: Path or file-like object handed straight to ``pd.read_csv``.
        data_cols: Columns to load (passed as ``usecols``).
        excluded_columns: A single column label, or a list/tuple of labels,
            to leave out of a second frame. Empty (the default) or ``None``
            means no second frame is built.

    Returns:
        ``df`` alone when nothing is excluded; otherwise the tuple
        ``(df, df_dropped)`` where ``df_dropped`` is ``df`` without the
        excluded column(s).
    """
    df = pd.read_csv(data_path, usecols=data_cols)

    # Nothing to exclude: keep the original single-frame return shape.
    if excluded_columns is None or len(excluded_columns) == 0:
        return df

    # A bare string is ONE label; anything else is treated as a collection of
    # labels (wrapping a list in another list would make drop() fail).
    if isinstance(excluded_columns, str):
        labels = [excluded_columns]
    else:
        labels = list(excluded_columns)

    return df, df.drop(columns=labels)
# Function which returns information about the data within the dataframe
# THE INFORMATION BEING:
# - Number of instances
# - Number of missing values
# - Fraction of missing values over all attribute values
# - Number of instances with missing values
# - Fraction of instances with missing values over all instances
def data_information_gathering(data_frame):
    """Print a table summarising how much of *data_frame* is missing.

    The table reports the number of instances (rows), the number of missing
    values, the fraction of missing values over all cells, the number of rows
    holding at least one missing value, and the fraction of such rows.

    Args:
        data_frame: The DataFrame to inspect. It is not modified.

    Returns:
        None. The table is written to stdout.
    """
    missing_mask = data_frame.isnull()            # True wherever a cell is missing

    num_instance = len(data_frame)                # Number of rows (instances)
    total_values = data_frame.size                # Number of cells (rows * columns)

    # Vectorised counts: summing the boolean mask avoids a Python-level loop over every cell.
    num_missing_values = missing_mask.to_numpy().sum()
    num_instances_missing_values = missing_mask.any(axis=1).sum()

    # An empty frame has nothing missing; report 0.0 rather than dividing by zero.
    frac_missing_values = num_missing_values / total_values if total_values else 0.0
    frac_missing_instances = num_instances_missing_values / num_instance if num_instance else 0.0

    # Formatting information into a PrettyTable
    table = PrettyTable()
    table.field_names = ["INFORMATION","VALUE"]
    table.align["INFORMATION"] = "l"
    table.add_row(["Number of instances:" , num_instance])
    table.add_row(["Number of missing values:", num_missing_values])
    table.add_row(["Fraction of missing values over all attribute values:", frac_missing_values])
    table.add_row(["Number of instances with missing values:", num_instances_missing_values])
    table.add_row(["Fraction of instances with missing values over all instances:", frac_missing_instances])
    print(table)
# Label-encodes each attribute and returns, per attribute, the set of distinct encoded values
def cols_encoding(data_frame, to_print = True):
    """Label-encode every column and collect the distinct codes of each.

    Args:
        data_frame: Frame whose columns are encoded one at a time.
        to_print: When truthy, also print an ATTRIBUTE / NOMINAL VALUE table.

    Returns:
        A list with one entry per column, each entry being the list of
        distinct encoded values found in that column.
    """
    encoder = preprocessing.LabelEncoder()
    distinct_codes = []

    for name in data_frame.columns:
        column = data_frame[name]
        # Object columns are cast to str before being handed to the encoder.
        if column.dtypes == object:
            column = column.astype(str)
        # set() collapses the per-row codes down to the distinct values.
        distinct_codes.append(list(set(encoder.fit_transform(column))))

    if to_print:
        # One table row per attribute: its name next to its distinct codes.
        table = PrettyTable()
        table.field_names = ["ATTRIBUTE","NOMINAL VALUE"]
        table.align["ATTRIBUTE"] = "l"
        table.align["NOMINAL VALUE"] = "l"
        for name, codes in zip(data_frame.columns, distinct_codes):
            table.add_row([name, codes])
        print(table)

    return distinct_codes
# A function called from create_decision_tree() function and used to encode the data used for training and testing.
def x_y_encoding(data_frame, target_column_index):
    """Label-encode a frame and split it into feature rows and target rows.

    Args:
        data_frame: Frame to encode; every column is encoded independently.
        target_column_index: Positional index of the target column.

    Returns:
        ``(encoded_rows, target_encoded_row)``: the features as a list of
        per-instance lists, and the target as a list of one-element lists.
    """
    encoder = preprocessing.LabelEncoder()

    # One list of codes per column, in column order.
    per_column_codes = []
    for name in data_frame.columns:
        column = data_frame[name]
        # Object columns are cast to str before being handed to the encoder.
        if column.dtypes == object:
            column = column.astype(str)
        per_column_codes.append(list(encoder.fit_transform(column)))

    # pop() both fetches the target column and removes it from the features.
    target_codes = per_column_codes.pop(target_column_index)
    target_encoded_row = [[code] for code in target_codes]

    # The codes are column-major so far; transpose to get one row per instance.
    feature_rows = np.array(per_column_codes).T.tolist()
    return feature_rows, target_encoded_row
# Function which creates, trains and tests a decision tree classifier
def create_decision_tree(data_frame, target_column, test_data_frame = pd.DataFrame(), ignore_missing = True):
    """Train a decision tree on *data_frame* and print its error rate.

    Args:
        data_frame: Training data, including the target column.
        target_column: Label of the column to predict.
        test_data_frame: Optional frame to draw test instances from. When it
            is empty (the default) a 20% hold-out split of *data_frame* is
            used instead. The default frame is never mutated here.
        ignore_missing: When truthy, rows with any missing value are dropped
            from both frames before use.

    Returns:
        None. A one-line summary with the error rate is printed.

    Raises:
        ValueError: If no test instances remain to evaluate on.
    """
    # If needed, ignore instances with missing values
    if ignore_missing:
        new_df = data_frame.dropna(axis = 0, how = 'any')
        new_test_df = test_data_frame.dropna(axis = 0, how = 'any')
    else:
        new_df, new_test_df = data_frame, test_data_frame

    # Positional index of the target column within the training frame
    target_column_index = data_frame.columns.get_loc(target_column)

    if test_data_frame.empty:
        X, Y = x_y_encoding(new_df, target_column_index)
        # Hold out 20% of the instances for testing; the fixed seed keeps the split repeatable.
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state = 0)
    else:
        # Rows whose index label also occurs in the training data are kept out of the test set
        testing_data = new_test_df.loc[~new_test_df.index.isin(new_df.index)]
        if testing_data.empty:
            raise ValueError("No test instances remain after removing the training rows")

        # Encode training and test rows in ONE pass. Encoding them separately
        # fits independent label encoders, so the same category could be given
        # different integer codes in the two sets and the tree would be
        # evaluated on inconsistently coded features.
        combined = pd.concat([new_df, testing_data])
        X_all, y_all = x_y_encoding(combined, target_column_index)
        n_train = len(new_df)
        X_train, y_train = X_all[:n_train], y_all[:n_train]
        X_test, y_test = X_all[n_train:], y_all[n_train:]

    len_test_data = len(X_test)
    if len_test_data == 0:
        raise ValueError("No test instances available to compute an error rate")

    clf = tree.DecisionTreeClassifier(random_state = 0)   # Fixed seed for a repeatable tree
    clf.fit(X_train, y_train)

    # Computing the error rate of the decision tree
    y_hat = clf.predict(X_test)
    # Each y_test entry is a one-element list, hence actual[0].
    count = sum(1 for predicted, actual in zip(y_hat, y_test) if predicted == actual[0])
    score = 100 - ((count/len_test_data) * 100)
    incorrect_predict = len_test_data - count
    print(f"Out of {len_test_data} values, {count} were correctly predicted and {incorrect_predict} were incorrectly predicted, resulting in {round(score,2)}% error rate")
# Function which splits a df into the rows that contain missing values and the rows that do not
def missing_val_df_split(data_frame):
    """Separate rows that contain a missing value from rows that do not.

    Args:
        data_frame: The frame to split. It is not modified.

    Returns:
        ``(missing_data_rows, full_data_rows)``: the rows with at least one
        missing value, then the rows with none.
    """
    row_has_gap = data_frame.isnull().any(axis=1)   # True for rows with any missing cell
    incomplete = data_frame[row_has_gap]
    complete = data_frame.dropna()
    return incomplete, complete
# Randomly picks rows from one df and appends them to another; used for creating D'
def add_random_df(data_frame1, data_frame2, amount = 0, random_state = None):
    """Append randomly sampled rows of *data_frame2* to *data_frame1*.

    Args:
        data_frame1: Frame whose rows are all kept, in order, at the top.
        data_frame2: Frame the random rows are sampled from (without
            replacement, so it must hold at least *amount* rows).
        amount: Number of rows to sample. ``0`` (the default) means "as many
            rows as *data_frame1* has".
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selection can be reproduced. ``None`` keeps the original
            unseeded behaviour.

    Returns:
        A new frame: *data_frame1* followed by the sampled rows.
    """
    if amount == 0:
        amount = len(data_frame1.index)
    random_rows = data_frame2.sample(n=amount, random_state=random_state)
    return pd.concat([data_frame1, random_rows])
# Filling missing values within a dataframe with a specified value or with the mode value
def populate_df_na_val(data_frame, fill_value = "", mode = False):
    """Return a copy of *data_frame* with its missing values filled in.

    Args:
        data_frame: Source frame. It is never modified.
        fill_value: Value written into every missing cell. Used only when
            *mode* is falsy and the value is non-empty.
        mode: When truthy (or when no usable *fill_value* is given), each
            column's missing cells take that column's most frequent value.

    Returns:
        A new DataFrame with the missing values replaced. A column that has
        no non-missing value at all has no mode and is left unchanged.
    """
    # Sized values (str, dict, ...) count as "given" when non-empty; scalars
    # such as 0 have no len(), so any non-None scalar counts as given.
    try:
        has_fill_value = len(fill_value) > 0
    except TypeError:
        has_fill_value = fill_value is not None

    if not mode and has_fill_value:
        # fillna without inplace already returns a fresh frame.
        return data_frame.fillna(fill_value)

    data_frame_fill = data_frame.copy()
    for column in data_frame_fill.columns:
        modes = data_frame_fill[column].mode()
        # mode() is empty for an all-missing column; there is nothing to fill with.
        if modes.empty:
            continue
        data_frame_fill[column] = data_frame_fill[column].fillna(modes.iat[0])
    return data_frame_fill
# # # # # # # # # # # # #
# CALLING THE FUNCTIONS #
# # # # # # # # # # # # #
# Everything below runs at import time: it reads the CSV, prints the tables and
# trains the trees in the order shown.
# Reading in the required csv file and creating the dataframes used for the rest of the solutions.
# df holds all listed columns; df_dropped is the same data without the 'class' column.
data_cols = ['age','workclass','education','education-num','marital-status','occupation','relationship','race','sex','capitalgain','capitalloss','hoursperweek','native-country','class']
df, df_dropped = read_data("data/adult.csv",data_cols, 'class')
# Question 1.1:
# Create a table in the report stating the following information about the adult data set:
# (i) number of instances,
# (ii) number of missing values,
# (iii) fraction of missing values over all attribute values,
# (iv) number of instances with missing values and
# (v) fraction of instances with missing values over all instances.
data_information_gathering(df)
# Question 1.2:
# Convert all 13 attributes into nominal using a Scikit-learn LabelEncoder.
# Then, print the set of all possible discrete values for each attribute.
# The returned list is not kept; only the printed table is used here.
cols_encoding(df_dropped)
# Question 1.3:
# Ignore any instance with missing value(s) and use Scikit-learn to build a decision tree
# for classifying an individual to one of the <= 50K and > 50K categories.
# Compute the error rate of the resulting tree
create_decision_tree(df,'class')
# Question 1.3:
# NOTE(review): this heading repeats "Question 1.3"; presumably it is the next question (1.4) - confirm against the brief.
# Construct a smaller data set D' from the original data set D, containing
# (i) all instances with at least one missing value and
# (ii) an equal number of randomly selected instances without missing values.
rows_with_missing_vals, full_data_rows = missing_val_df_split(df)
# No seed is passed here, so the randomly selected rows can differ between runs.
joint_dfs = add_random_df(rows_with_missing_vals, full_data_rows)
# Construct D'1 by creating a new value "missing" for each attribute and using this value for every missing value in D'
filled_missing = populate_df_na_val(joint_dfs,"missing")
# Construct D'2 by using the most popular value for all missing values of each attribute
filled_mode = populate_df_na_val(joint_dfs,mode = True)
# Train two decision trees with these two data sets and compare their error rates using instances from D for testing.
create_decision_tree(filled_missing, 'class', df)
create_decision_tree(filled_mode, 'class', df)