-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA with SQL
46 lines (34 loc) · 2.57 KB
/
EDA with SQL
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#Coded on the IBM Db2 Cloud platform for use with their services. Code might work differently using different interfaces.
!pip install --force-reinstall ibm_db==3.1.0 ibm_db_sa==0.3.3
!pip uninstall sqlalchemy==1.4 -y && pip install sqlalchemy==1.3.24
!pip install ipython-sql
%load_ext sql
#----------------------------------------------------------------------------------------------------------------------------
# Remember the connection string formatted like so:
# %sql ibm_db_sa://my-username:my-password@hostname:port/BLUDB?security=SSL
# Enter the connection string for your Db2 on Cloud database instance below
# i.e. copy after db2:// from the URI string in Service Credentials of your Db2 instance. Remove the double quotes at the end.
#----------------------------------------------------------------------------------------------------------------------------
%sql ibm_db_sa://zpl22708:QPqmqmuvC91F5Vwp@9938aec0-8105-433e-8bf9-0fbb7e483086.c1ogj3sd0tgtu0lqde00.databases.appdomain.cloud:32459/bludb?authSource=admin&replicaSet=replset
import pandas
chicago_socioeconomic_data = pandas.read_csv('https://data.cityofchicago.org/resource/jcxq-k9xf.csv')
%sql PERSIST chicago_socioeconomic_data
#First, a count of how many rows we have to choose from.
%sql SELECT COUNT(*) FROM chicago_socioeconomic_data;
#Then we search for how many defined community areas have a hardship index greater than 50.0, or any other number you want.
%sql SELECT COUNT(*) FROM chicago_socioeconomic_data WHERE hardship_index > 50.0;
#To give context to the hardship index of 50.0 we look for the max value in that set with the following.
%sql SELECT MAX(hardship_index) FROM chicago_socioeconomic_data;
#And now we can search for which community area has the highest hardship index
%sql SELECT community_area_name FROM chicago_socioeconomic_data where hardship_index=98.0
#The following allows us to search for income per community area. Enter any number you wish, but we're looking for more than $60,000.
%sql SELECT community_area_name FROM chicago_socioeconomic_data where per_capita_income_ > 60000;
#Now we can create a scatterplot with per capita income vs. hardship index for all community areas.
# if there is a module not found error try unhashing the following line to install seaborn. Otherwise, continue on.
#!pip install seaborn==0.9.0
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
income_vs_hardship = %sql SELECT per_capita_income_, hardship_index FROM chicago_socioeconomic_data;
plot = sns.jointplot(x='per_capita_income_',y='hardship_index', data=income_vs_hardship.DataFrame())
#End