# BMI_Insurance.py
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 22 01:57:26 2020
@author: Deep Patel
Characteristics of patients affecting BMI and insurance expenses.
Descriptive statistical analysis with data aggregation,
using the numpy and pandas libraries.
"""
import numpy as np
import pandas as pd
# Load the insurance records from 'insurance.txt' into a NumPy structured
# array. The first row of the file is skipped (presumably a header row --
# confirm against the data file). Text columns are read as byte strings
# ('S100') and every numeric column as a float.
# The builtin `float` is used for the numeric formats: the `np.float` alias
# was deprecated in NumPy 1.20 and removed in NumPy 1.24, and it was only
# ever an alias for the builtin.
arr = np.loadtxt(
    'insurance.txt',
    dtype={
        'names': ('age', 'sex', 'bmi', 'children',
                  'smoker', 'region', 'expenses'),
        'formats': [float, 'S100', float,
                    float, 'S100', 'S100',
                    float],
    },
    skiprows=1,
)
# Summary statistics for the 'age' column: mean, median and standard
# deviation. np.std is called with its default ddof=0 (population standard
# deviation), unlike the pandas 'std' aggregations used later in this script,
# which default to ddof=1.
age_mean = np.mean(arr['age'])
age_median = np.median(arr['age'])
age_std = np.std(arr['age'])
a1 = 'Basic analysis of AGE'
# One line per statistic, each rounded to two decimals.
a2 = (
    "The mean age is {:.2f}\n"
    "The median age is {:.2f}\n"
    "The standard deviation of age is {:.2f}\n"
).format(age_mean, age_median, age_std)
a3 = "---------------------------------------------------------------------\n"
# Summary statistics for the 'bmi' column: mean, median and standard
# deviation. np.std is called with its default ddof=0 (population standard
# deviation).
BMI_mean = np.mean(arr['bmi'])
BMI_median = np.median(arr['bmi'])
BMI_std = np.std(arr['bmi'])
a4 = 'Basic analysis of BMI'
# One line per statistic, each rounded to two decimals.
a5 = (
    "The mean BMI is {:.2f}\n"
    "The median BMI is {:.2f}\n"
    "The standard deviation of BMI is {:.2f}\n"
).format(BMI_mean, BMI_median, BMI_std)
a6 = "---------------------------------------------------------------------\n"
# Wrap the structured array in a DataFrame so rows can be filtered and
# grouped. The three text columns were loaded as byte strings, so decode
# them to str to allow comparisons such as df['smoker'] == 'yes'.
df = pd.DataFrame(arr)
for text_col in ('smoker', 'region', 'sex'):
    df[text_col] = df[text_col].str.decode('utf-8')
# BMI mean / median / standard deviation for each value of 'sex',
# rounded to two decimals and rendered as a text table.
a7 = ("The mean, median and standard deviation of BMI for sex "
      "is as follows: \n")
Sex_Groupby = df.groupby(['sex']).agg({'bmi': ['mean', 'median', 'std']})
Sex_Groupby = Sex_Groupby.round(2)
a8 = "{}\n".format(Sex_Groupby.to_string())
a9 = "----------------------------------------------------------------------\n"
# BMI mean / median / standard deviation, reported separately for smokers
# and non-smokers (two filtered frames rather than a single groupby).
Smoke_yes = df.loc[df['smoker'] == 'yes']
Smoke_no = df.loc[df['smoker'] == 'no']

# Smokers.
a10 = ("The mean, median and standard deviation of BMI of smokers is "
       "as follows: \n")
ysmoke = Smoke_yes.agg({'bmi': ['mean', 'median', 'std']})
ysmoke = ysmoke.round(2)
a11 = "{}\n".format(ysmoke.to_string())

# Non-smokers.
a12 = ("The mean, median and standard deviation of BMI of non-smokers is "
       "as follows: \n")
nsmoke = Smoke_no.agg({'bmi': ['mean', 'median', 'std']})
nsmoke = nsmoke.round(2)
a13 = "{}\n".format(nsmoke.to_string())
a14 = "---------------------------------------------------------------------\n"
# BMI mean / median / standard deviation for each value of 'region',
# rounded to two decimals and rendered as a text table.
a15 = ("The mean, median and standard deviation of BMI by region is "
       "as follows: \n")
Region_Groupby = df.groupby(['region']).agg(
    {'bmi': ['mean', 'median', 'std']}
)
Region_Groupby = Region_Groupby.round(2)
a16 = "{}\n".format(Region_Groupby.to_string())
a17 = "---------------------------------------------------------------------\n"
# BMI mean / median / standard deviation for records with more than two
# children (children > 2, i.e. three or more).
two_children = df.loc[df['children'] > 2]
a18 = ("The mean, median and standard deviation of BMI of those who have "
       "more than two children is as follows: \n")
two_child = two_children.agg({'bmi': ['mean', 'median', 'std']})
two_child = two_child.round(2)
a19 = "{}\n".format(two_child.to_string())
a20 = "----------------------------------------------------------------------\n"
# Rank every record by expenses, highest first, then split the ranking into
# the top 20% and everything else.
exp = df.sort_values(['expenses'], ascending=False)
# Size of the top group: 20% of the row count, truncated toward zero.
top20_count = int(len(exp) * 0.20)
exp_top20 = exp.head(top20_count)
# The bottom group is the exact complement of the top slice. Sizing it as
# int(len * 0.80) instead truncates a second time, so when the row count is
# not a multiple of 5 one mid-ranked row would belong to neither group.
exp_bottom80 = exp.iloc[top20_count:]
# Report for the top 20% of expenses: BMI mean / median / standard
# deviation, plus the 'mode' aggregation of the 'smoker' and 'region' columns.
a21 = ("The mean, median and standard deviation of BMI for the top 20% "
       "of the expenses as follows: \n")
expt20 = exp_top20.agg({'bmi': ['mean', 'median', 'std']})
expt20 = expt20.round(2)
expt20_mode = exp_top20.agg({'smoker': ['mode'], 'region': ['mode']})
a22 = "{}\n".format(expt20.to_string())
a23 = "{}\n".format(expt20_mode.to_string())
a24 = "--------------------------------------------------------------------\n"
# Report for the remaining 80% of expenses: BMI mean / median / standard
# deviation, plus the 'mode' aggregation of the 'smoker' and 'region' columns.
a25 = ("The mean, median and standard deviation of BMI for the rest 80% "
       "of the expenses as follows: \n")
expb80 = exp_bottom80.agg({'bmi': ['mean', 'median', 'std']})
expb80 = expb80.round(2)
expb80_mode = exp_bottom80.agg({'smoker': ['mode'], 'region': ['mode']})
a26 = "{}\n".format(expb80.to_string())
a27 = "{}\n".format(expb80_mode.to_string())
# Collect every report fragment, in output order, into one tuple.
results = (a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15,
           a16, a17, a18, a19, a20, a21, a22, a23, a24, a25, a26, a27)

# Write the report to 'results.txt', one fragment per write followed by a
# newline. A single handle opened in 'w' mode creates or truncates the file,
# and the with-block guarantees it is closed even if a write fails; opening
# the same path a second time in append mode is unnecessary.
with open('results.txt', 'w', encoding='utf-8') as results_file:
    results_file.writelines(str(line) + "\n" for line in results)