# Ecoli_Houston.py
# -*- coding: utf-8 -*-
"""
@author: Deep Patel
Chemical factors affecting the presence of E-coli bacteria in city’s waterways
"""
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
# Work from the folder that holds the workbook (absolute, machine-specific path).
os.chdir("C:\\Users\\deepp\\Google Drive")
# Reading the excel file for Ecoli Data
ecoli = pd.read_excel('Ecoli Data.xlsx')
# New dataframe holding only the collection date and the E.coli reading
ecoli_date = ecoli[["Date Collected", "E.coli"]]
Ecoli = ecoli_date.rename(columns={'Date Collected': 'Date_Collected'})
Ecoli['Date_Collected'] = pd.to_datetime(Ecoli['Date_Collected'])
# dropna() returns a new dataframe instead of modifying in place, so the
# result must be assigned back for the rows with missing values to be removed.
Ecoli = Ecoli.dropna()
# Average amount of E-coli per year in Houston
# (grouping key is the four-digit year string of the collection date)
Ecoli_Year_Avg = (
    Ecoli.groupby([Ecoli.Date_Collected.dt.strftime('%Y')])['E.coli']
    .mean()
    .reset_index(name='Ecoli Yearly Average')
)
print(Ecoli_Year_Avg)
# Bar graph of E-coli amount by year (year strings become the x-axis index)
plot1 = Ecoli_Year_Avg.set_index('Date_Collected')
plot1a = plot1.plot.bar()
plot1a.set_xlabel("Year")
plot1a.set_ylabel("E-coli Average Amount (#/100mL)")
plot1a.set_title("Amount of E.coli per year in Houston")
#plt.savefig('Ecoli_Year.jpg') #To save graph in the current directory
print("---------------------------------------------------------------------")
# Calendar year and abbreviated month name of every sample
Ecoli['Year'] = Ecoli.Date_Collected.dt.year
Ecoli['Month'] = Ecoli.Date_Collected.dt.strftime('%b')
Ecoli_Month_Avg = (
    Ecoli.groupby(['Month', 'Year'])['E.coli']
    .mean()
    .reset_index(name='Ecoli Monthly Average')
)
# Creating list to match for sorting the data by months: an ordered
# categorical makes the months sort in calendar order, not alphabetically.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
Ecoli_Month_Avg['Month'] = pd.Categorical(Ecoli_Month_Avg['Month'],
                                          categories=months, ordered=True)
Ecoli_sort = Ecoli_Month_Avg.sort_values(["Month", "Year"])
Ecoli_new = pd.pivot_table(Ecoli_sort,
                           index=["Month", "Year"],
                           values=["Ecoli Monthly Average"])
# Average amount of E-coli per month in Houston (grouped by year)
print(Ecoli_new)
# Bar graph comparing each month across the years present in the data:
# unstacking moves the Year index level into the columns, one bar per year.
monthly_by_year = Ecoli_new.unstack()
plot2 = monthly_by_year.plot(kind='bar', width=0.8)
plot2.set_xlabel("Month")
plot2.set_ylabel("Ecoli Average Monthly Amount (#/100mL)")
# Legend labels are taken from the plotted Year level rather than a fixed
# list, so they stay correct whatever years the workbook covers. int() strips
# a trailing ".0" in case the year column was stored as float.
year_labels = [str(int(year))
               for year in monthly_by_year.columns.get_level_values('Year')]
plt.legend(labels=year_labels, loc='upper left')
plt.title("Amount of E.coli per month in Houston over different years ")
# plt.savefig('Ecoli_Month_grouped.jpg') #To save graph in current directory
#----------------------------------------------------------------------------
# Creating new Dataframe for chemical factors and E.coli
Ecoli_selected = ecoli[['E.coli', 'PH', 'Dissolved Oxygen', 'Phosphorous',
                        'Specific Conductance', 'Chloride', 'Ammonia Nitrogen',
                        'Nitrate Nitrogen', 'Sulfate']]
# checking the distribution of the features: one panel per column on a 3x3
# grid. plt.subplots() creates its own figure, so no separate plt.figure()
# call is made beforehand (it would only leave an extra, empty figure).
f, axes = plt.subplots(3, 3)
# axes.flat walks the grid row by row, so the columns fill the panels in the
# same order as they are listed in Ecoli_selected.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; switch
# to histplot/displot once the target seaborn version is confirmed.
for feature, panel in zip(Ecoli_selected.columns, axes.flat):
    sns.distplot(Ecoli_selected[feature], bins=10, color='k', ax=panel)
# plt.savefig('Distributions.jpg')
# From the distribution plots we can see that E.coli, Specific Conductance,
# Chloride, Ammonia nitrogen, Sulfate are concentrated at one point and not
# distributed well. Therefore, performing data transformation may be a good idea.
# Data transformation on selected features and omitting missing values:
# row-to-row difference of the natural log of every column.
# np.log gives -inf for a zero reading, and the difference then yields +/-inf;
# dropna() does not remove infinities and they would turn correlation entries
# into NaN, so they are converted to NaN first and dropped with the other
# missing rows.
Ecoli_clean = (
    np.log(Ecoli_selected)
    .diff()
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)
corr = Ecoli_clean.corr(method="pearson")
#pd.set_option('display.max_columns',4)
print("Correlation Matrix showing correlation between chemical factors "
      "and Ecoli in first column or first row")
print(corr) #prints correlation matrix
# Correlation Matrix heatmap using seaborn (drawn on a fresh figure so it
# does not land on the axes of an earlier plot)
plt.figure()
feature_names = corr.columns.values
sns.heatmap(corr,
            xticklabels=feature_names,
            yticklabels=feature_names)
plt.title("Heatmap of Correlation Matrix")
# plt.savefig('Correlation_Matrix_Heatmap.jpg')
print("From correlation matrix and heatmap, we can see that the "
      "correlation between Ecoli and chemical factors are weak. "
      "The correlation between Ecoli and Specific condutance, between "
      "Ecoli and Chloride, and between Ecoli and Sulfate are moderate "
      "negative correlation wheareas correlation between Ecoli and "
      "Phosporous, and Ecoli and Ammonia Nitrogen is weakest.\n")
#-----------------------------------------------------------------------------
# Extra plots to see the relationship between chemical factors & E.coli
print("Additional analysis: The regression plots and pairs plot are just "
      "to further confirm the relationship patterns observed with "
      "correlation values")
#-----------------------------------------------------------------------------
# Based on the correlations, specific features are selected and
# Using regression plot to see the relationship
# Uncomment the block of commented code below (down to the second
# "plt.savefig" line) to see the regression plots
# plt.figure()
# f, axes= plt.subplots(ncols=2)
# sns.regplot(x='PH', y='E.coli', data=Ecoli_clean,
#             scatter_kws={'s':2}, ax=axes[0])
# sns.regplot(x='Specific Conductance', y='E.coli', data=Ecoli_clean,
#             scatter_kws={'s':2}, ax=axes[1])
# plt.title('Changes in log %s and log %s versus log %s'
#           %('PH', 'Specific Conductance','E.coli'))
# #plt.savefig('Regplots1.jpg')
# plt.figure()
# f, axes= plt.subplots(ncols=2)
# sns.regplot(x='Chloride', y='E.coli', data=Ecoli_clean,
#             scatter_kws={'s':2}, ax=axes[0])
# sns.regplot(x='Sulfate', y='E.coli', data=Ecoli_clean,
#             scatter_kws={'s':2}, ax=axes[1])
# plt.title('Changes in log%s and log %s versus log %s'
#           %('Chloride','Sulfate', 'E.coli'))
# #plt.savefig('Regplots2.jpg')
# Making new dataframe for pairs plot
Ecoli_select2 = Ecoli_clean[['E.coli', 'PH',
                             'Specific Conductance', 'Chloride', 'Sulfate']]
# Pairs plot (Scatterplot matrix of selected chemical factors & Ecoli)
pairs_grid = sns.pairplot(Ecoli_select2, diag_kind='kde')
# pairplot builds a whole grid of subplots; plt.title() would only title the
# most recently created subplot, so the title is set on the figure itself and
# a little headroom is reserved above the grid for it.
pairs_grid.fig.suptitle("Pairs plot")
pairs_grid.fig.subplots_adjust(top=0.95)
#plt.savefig('PairsPlot.jpg')
# From the regression plots and pairs plot, we can see that there is
# not sufficient evidence of relationship between Ecoli and selected features
# with chemical factors.