-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSeverity_of_Inconsistent_Data.py
175 lines (142 loc) · 6.33 KB
/
Severity_of_Inconsistent_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 1 09:36:57 2020
Authors: Deep Patel and Kenneth Hora
Determing Severity of Missing and Inconsistent Data
"""
# Imports the necessary libraries
import pandas as pd
#import numpy as np
import os
os.chdir("C:\\Users\\deepp\\Google Drive")
# Reads the file
training = pd.read_csv('titanic_traning.csv')
# Making the index of the test dataframe the same as the ID number
start_training_index = training["ID"][0]
end_training_index = training["ID"][len(training["ID"])-1]
training_index_list = list(range(start_training_index,
(end_training_index+1)))
training.index = training_index_list
n = len(training)
# Initilizes the table of missing/inconsistent values
MIV_table = pd.DataFrame()
MIV_table['Feature'] = training.columns[1:]
MIV_table['Missing Values (MV)'] = [0, 0, 0, 0, 0, 0, 0, 0]
MIV_table['% of MV (MV/n)'] = [0, 0, 0, 0, 0, 0, 0, 0]
MIV_table['Inconsistency Values (IV)'] = [0, 0, 0, 0, 0, 0, 0, 0]
MIV_table['% of IV (IV/n)'] = [0, 0, 0, 0, 0, 0, 0, 0]
# Making the index the same as the feature
MIV_table.index = training.columns[1:]
# Counts the missing values in each feature and calcualtes the % MV
for i in training.columns[1:]:
MIV_table.loc[[i], 'Missing Values (MV)'] = \
len(training[i][training[i].isnull()])
MIV_table.loc[[i], '% of MV (MV/n)'] = \
round(((MIV_table.loc[[i], 'Missing Values (MV)'])/n)*100, 3)
# Initilizes list of indeces of rows with MV or IV
MIV_index = []
# Counts the inconsistent values in each feature and tracks the indeces of IV
for i in training.index:
if training['pclass'][i] not in [1, 2, 3]:
MIV_table.loc[['pclass'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
if training['sex'][i] not in ['male', 'female']:
MIV_table.loc[['sex'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
if training['embarked'][i] not in ['C', 'Q', 'S']:
MIV_table.loc[['embarked'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
if training['sibsp'][i] not in list(range(0,12)):
MIV_table.loc[['sibsp'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
if training['parch'][i] not in list(range(0,12)):
MIV_table.loc[['parch'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
if training['survived'][i] not in [0, 1]:
MIV_table.loc[['survived'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
# Float values of age are treated as inconsistent. They will be trunked later
if training['age'][i] not in list(range(1,99)):
MIV_table.loc[['age'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
# The IV for age needs to be corrected by subtracting MV because the method
# for finding IV in age counted both IV and MV.
MIV_table.loc[['age'], 'Inconsistency Values (IV)'] = \
MIV_table.loc[['age'], 'Inconsistency Values (IV)'] - \
len(training['age'][training['age'].isnull()])
import decimal
def drange(x, y, jump):
while x < y:
yield float(x)
x += decimal.Decimal(jump)
# Float values greater than 1 decimal place of fare are treated as
# inconsistent.
for i in training.index:
if training['fare'][i] not in list(drange(0,550,
decimal.Decimal('0.1'))):
MIV_table.loc[['fare'], 'Inconsistency Values (IV)'] += 1
if i not in MIV_index:
MIV_index.append(i)
# The IV for fare needs to be corrected by subtracting MV because the method
# for finding IV in age counted both IV and MV.
MIV_table.loc[['fare'], 'Inconsistency Values (IV)'] = \
MIV_table.loc[['fare'], 'Inconsistency Values (IV)'] - \
len(training['fare'][training['fare'].isnull()])
# Calculates the % of IV for each feature
for i in training.columns[1:]:
MIV_table.loc[[i], '% of IV (IV/n)'] = \
round(((MIV_table.loc[[i], 'Inconsistency Values (IV)'])/n)*100,3)
# age has NAs and floats (.5) and floats less than zero.
# fare has zero, NA and missing values...
# embarked just has Queenstown spelled out
# nothing wrong in sibsp
# nothing wrong in parch
# nothing wrong in survived
# Creates a dataframe of the M/I values.
MIV_frame = training.loc[MIV_index, training.columns]
# Creates replacement values for M/I values
fare_mean = round(training['fare'].mean(), 2)
age_mean = int(round(training['age'].mean(), 0))
sibsp_mean = int(round(training['sibsp'].mean(), 0))
parch_mean = int(round(training['parch'].mean(), 0))
pclass_mode = training['pclass'].mode()[0]
sex_mode = training['sex'].mode()[0]
embarked_mode = training['embarked'].mode()[0]
# Replaces missing values
training['fare']=training['fare'].fillna(fare_mean)
training['age']=training['age'].fillna(age_mean)
training['sibsp']=training['sibsp'].fillna(sibsp_mean)
training['parch']=training['parch'].fillna(parch_mean)
training['pclass']=training['pclass'].fillna(pclass_mode)
training['sex']=training['sex'].fillna(sex_mode)
training['embarked']=training['embarked'].fillna(embarked_mode)
# Replace inconsistent values
for i in training.index:
if training['sex'][i] == 'Male':
training.at[i, 'sex'] = 'male'
elif training['sex'][i] == 'Female':
training.at[i, 'sex'] = 'female'
for i in training.index:
if training['embarked'][i] == 'Queenstown':
training.at[i, 'embarked'] = 'Q'
# Truncates float values in age
for i in training['age']:
training['age'] = training['age']//1
# Writing the cleaned data to a .csv file
training.to_csv('cleaned_titanic.csv', index = False)
# Writes the MIV summary table and records of MIV to a .xlsx file
with pd.ExcelWriter('displays.xlsx') as writer:
MIV_table.to_excel(writer, sheet_name = 'MIV_table', index = False)
MIV_frame.to_excel(writer, sheet_name = 'MIV_frame', index = False)
workbook=writer.book
worksheet1=writer.sheets['MIV_table']
worksheet1.set_column('B:E',21)
worksheet2=writer.sheets['MIV_frame']
worksheet2.set_column('H:I',11)