-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata.py
523 lines (381 loc) · 16.7 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
import datadotworld as dw
import numpy as np
import pandas as pd
################################################################################
################################################################################
## HELPER FUNCTIONS ##
################################################################################
################################################################################
def weekday_name(integer):
"""Convert integer from 0-6 to return corresponding weekday name.
Function takes an integer from dayofweek value for timestamp and
returns the name of the day of the week. For example: a day of
week value of 0 returns 'Monday'
Parameters
----------
integer: int (valid values range 0 to 6)
Returns
-------
Str: Returns the name of the day of the week.
Example
-------
>>> weekday_name(3) # returns Thursday
"""
day_names = ("Monday","Tuesday","Wednesday","Thursday",
"Friday","Saturday","Sunday")
return day_names[integer]
def month_name(integer):
"""Convert integer from timestamp month to return corresponding
month name.
Function takes integer from month value for timestamp and returns
corresponding name of month in calendar year (zero indexed) as a
string. For example, a timestamp with datetime.month value of 3
returns April.
Parameters
----------
integer: int (timestamp.month)
Returns
-------
Str: Name of month at correspnding place in list of months
zero indexed and in caldenar year order.
Example
-------
>>> month_name(12) # returns December
"""
month_names = ['January','February','March','April','May','June',
'July','August','September','October','November','December']
return month_names[integer-1]
def get_fiscalyear(column, fiscalyear_start=7):
"""Create conversion column that reads dates and returns fiscal year.
Function takes a datetime series object or column and produces a
list with the corresponding fiscal year as a four digit year for
each date in the original series.
The function's default value is based on the Maryland Govt fiscal
year which runs from July 1st (month 7) to June 30th. It returns
a list that is the same size as the original column making it easy
to simply use the return from the function call as a new column
for the same dataframe, adding data for fiscal year. The
fiscalyear_start parameter allows for generation of fiscal year data
for various months.
Parameters
-----------
column: pandas Series
a column of data type datetime.
fiscalyear_start: int
a number representing the numerical value for month of
the year to be used as the start of the fiscal year.
Returns
--------
Pandas Series: Series or column containing data for 4 digit number
representing fiscal year.
Examples
--------
>>> get_fiscalyear(df['request_date']) # assumes July fiscal year start
>>> get_fiscalyear(df['request_date'], fiscalyear_start=3) # assumes March
fiscal year start
>>> get_fiscalyear(df['request_date'], 10) # October fiscal year start
"""
fiscal_year = [date.year + 1 if date.month >= fiscalyear_start
else date.year for date in column]
return fiscal_year
class ValueNotTypeFloatIntListSetStr(ValueError):
pass
def valid_instance(value):
"""Checks whether a value is instance of float,int,list,set or str and returns Boolean.
Parameters
----------
value: float int list set or str
Any value passed to function such as 8, '8', [8,'8'], 8.0 or set([8,'8'])
Returns
-------
Boolean: True if value parameter is of type float, int, list set or str,
and returns False otherwise.
Example:
--------
>>> valid_instance("I'm good!") # returns True
>>> valid_instance({'name':'Joe Budden'}) # raises Error
"""
if (isinstance(value, float) or
isinstance(value, int) or
isinstance(value, list) or
isinstance(value, set) or
isinstance(value, str)):
return True
else:
### Valid value types include: [str, int, list, float]
raise ValueNotTypeFloatIntListSetStr(value)
def container_hasvalue(container, value):
"""Checks whether or not a value appears in a column and returns Boolean.
Parameters
----------
container: Pandas Series or list
A dataframe column or list of values.
value: Valid values include Int, Str, Float
A value for against which the list or series object can be searched
to determine its existence.
Returns
-------
Boolean: Returns True if value is in container object, False otherwise
"""
if valid_instance(value):
container = list(container)
container = set(container)
if value in container:
return True
else:
return False
else:
return False
class ValueNotTypeInt(ValueError):
pass
def valid_fiscalyear(series, year):
"""Checks whether or not a fiscalyear value appears in a column.
Parameters
----------
series: Pandas Series object
A dataframe column against which a check will be performed to determine
existence of a particular year value.
year: Int
A 4 digit fiscal year representing the value that will be searched for
in the series.
Returns
--------
Boolean: Returns True if year is in series, False if not.
Example
-------
>>> valid_fiscalyear(series=requestDate, year=2017) # returns True if 2017 is in
requestDate column
>>> valid_fiscalyear(requestDate, '2030') # returns ValueNotTypeInt error
"""
if isinstance(year, int):
return container_hasvalue(series, year)
else:
### year parameter expects value of type int
raise ValueNotTypeInt(year)
def filter_fiscalyear(dframe, column, fiscalyear):
"""Return filtered dataframe with a view of data for a single fiscalyear.
Function uses a fiscalyear parameter to slice a dataframe and provide
a dataset that excludes data for all fiscalyears except the one passed. It
accepts an integer representing a 4-digit year present in the target column.
Parameters
----------
dframe: Pandas Dataframe
column: Str
Column name or Pandas Series containing the fiscalyear data. Expects a
column with values similar in format to what is passed to the fiscalyear
parameter. For example, a series with values: [2015,2018,2016,2016,2016...]
fiscalyear: Int
4 digit year of one of the fiscalyears present in target column.
Returns
-------
Pandas Dataframe
Example
-------
>>> filter_fiscalyear(dframe, 'fy_completed')
>>> filter_fiscalyear(dframe, 'FYear_Requested', 2015)
"""
if valid_fiscalyear(series=dframe[column], year=fiscalyear):
try:
dataframe = dframe[(dframe[column] == fiscalyear)]
return dataframe
except Exception as e:
print(e) # convert to log
else:
print('Function failed to return filtered dataframe. Exited false vailid_fiscalyear')
def null_nonexistent(series):
"""Performs check to see if a column has null values and returns boolean.
Parameters
----------
series: Pandas Series
The column name of a dataframe
Returns
-------
Boolean: Returns True if there are no null values in column, False otherwise.
Example
-------
>>> null_nonexistent(dataframe.index) # returns True
"""
if series.isnull().sum() > 0:
return False
else:
return True
def ontime(dframe, problemtype_column='prob_type', duration_column='duration'):
"""Get information by type, on count, average duration and percentage of workorders
closed ontime.
Returns tupple of 3 Pandas Series objects with information on the percentage of
workorders on time for each problem type. Information returned at each position in
tupple is: 1) number of requests by problem type, 2) mean duration by problem type
3) percentage of workorders ontime by problem type.
Tupple allows for unpacking for elements required to create charts on workorders
closed ontime. For example, values for x, y and size of a scatter plot can be passed
as x = ontime(dframe)[0], y = ontime(dframe)[1], size = ontime(dframe)[2]
Parameters
----------
dframe: Pandas Dataframe
Expects a dataframe that has had all open workorders removed.
Filtering out open workorders removes null values from completion
and duration columns to support calculation of groupby and average
taking on dataframe.
problemtype_column: Str
Name of column containing data on the workorder type category.
duration_column: Str
Name of datetime type column containing duration for each workorder.
Returns
-------
Tupple: Return object has 3 items in tupple:
(total_volume_bytype, avg_duration_bytype, percentage_ontime)
Example
-------
>>> ontime(df,'prob_type','duration') # if duration has no null values returns tupple
>>> ontime(df, 'ptypes', 'drtn') # if null values present returns string:
'Check "drtn" column, ensure there are no null values.'
"""
if null_nonexistent(dframe[duration_column]):
try:
dframe['days_integer'] = dframe[duration_column].dt.days.astype(int)
dframe['average_duration'] = dframe.groupby(problemtype_column)['days_integer'].transform('mean')
dframe['ontime'] = np.where(dframe['days_integer'] <= dframe['average_duration'], 1, 0)
avg_duration_bytype = dframe.groupby(problemtype_column)['average_duration'].mean()
number_ontime_bytype = dframe.groupby(problemtype_column)['ontime'].sum()
total_volume_bytype = dframe.groupby(problemtype_column)['ontime'].count()
percentage_ontime = (number_ontime_bytype / total_volume_bytype) * 100
return (total_volume_bytype, avg_duration_bytype, percentage_ontime)
except Exception as e:
print(e) # convert to log
else:
return 'Check "{}" column, ensure there are no null values.'.format(duration_column)
def remove_open_workorders(dframe, column='date_completed'):
"""Return filtered dataframe removing workorder data missing completion dates.
Parameters
----------
dframe: Pandas Dataframe
column: Str
Name of datetime object column containing dates of work order
completion.
Returns
-------
Pandas Dataframe
Example
-------
>>> remove_open_workorders(dframe)
"""
dataframe = dframe[(dframe[column].notnull())]
return dataframe
################################################################################
################################################################################
## REQUEST DATA ##
################################################################################
################################################################################
def get_data(dataset_name, dataframe_name):
"""Request data from datadotworld API, and returns pandas dataframe.
Additional information on the datadotworld api can be found at the
following site: https://apidocs.data.world/api
Parameters
----------
dataset_name: str
name assigned to the desired dataset stored with
datadotworld service.
dataframe_name: str
name of the key associated with the datadotworld
dataset which stores objects within the dataset within
a dictionary of dataframes in key value pair.
Returns
-------
Pandas Dataframe
Examples
--------
>>> get_data(dataset_name='census2020', dataframe_name='Kansas')
>>> get_data('performance_indicators', 'public_safety')
"""
dataworld_obj = dw.load_dataset(dataset_name)
dataframe = dataworld_obj.dataframes[dataframe_name]
return dataframe
def clean_data(dframe):
'''Perform reindexing, cleaning and feature generation on dataframe for
dashboard.
Parameters
----------
dframe: Pandas Dataframe
Expects pandas dataframe returned from the get_data function. The
data.py module structure separates the request from the preparation
of the data into two functions: get_data and this clean_data
Returns
-------
Pandas Dataframe: Returns a new dataframe with releveant Series converted
to datetime objects, adds relevant features for duration
and resets the index.
Example
-------
>>> data = get_data(dataset_name='performance_indicators',
dataframe_name='public_safety')
clean_data(dframe=data)
'''
target_columns = (['wo_id','date_completed','prob_type','bl_id',
'completed_by','date_requested','time_completed',
'time_start','time_end'])
if isinstance(dframe, pd.core.frame.DataFrame):
try:
dframe = dframe[target_columns][(dframe['prob_type'] != 'TEST(DO NOT USE)')]
dframe['date_completed'] = pd.to_datetime(dframe['date_completed'])
dframe['date_requested'] = pd.to_datetime(dframe['date_requested'])
dframe.set_index('date_requested', inplace=True)
dframe['duration'] = dframe['date_completed'] - dframe.index
dframe['fiscal_year_requested'] = get_fiscalyear(dframe.index)
dframe['fiscal_year_completed'] = get_fiscalyear(dframe['date_completed'])
dframe.sort_index(inplace=True)
status = 'Pass'
except Exception as e:
status = 'Fail'
# log event if failed
else:
print('Function requires pandas dataframe object but received type: {}.'
.format(type(dframe)))
return dframe
def make_map_dframe(dframe, excelfile, skipnrows=6):
"""Generate dataframe for mapping charts with archibus data.
Parameters
----------
dframe: pandas dataframe
the dataframe returned from the clean_data function
in the data.py module
excelfile: file object
excel file with the latitude and longitude data required
to identify the location of markers for archibus data.
skipnrows: int
the number of rows to skip when reading in excel sheet
for conversion to pandas dataframe.
Returns
-------
Pandas Dataframe
Example
-------
>>> make_map_dframe(dframe=df, excelfile='file.xlsx', skipnrows=5)
"""
df = pd.read_excel(excelfile, skiprows=skipnrows)
df.columns = ['bl_id','name','addr','site_id','latitude','longitude']
geo_dict = {}
for bld in df['bl_id'].unique():
geo_dict[bld] = {'latitude': df.loc[df['bl_id'] == bld]['latitude'].values[0],
'longitude': df.loc[df['bl_id'] == bld]['longitude'].values[0],
'bld_name': df.loc[df['bl_id'] == bld]['name'].values[0]}
dframe['latitude'] = dframe['bl_id'].apply(lambda x: geo_dict[x]['latitude'])
dframe['longitude'] = dframe['bl_id'].apply(lambda x: geo_dict[x]['longitude'])
dframe['bld_name'] = dframe['bl_id'].apply(lambda x: geo_dict[x]['bld_name'])
return dframe
################################################################################
################################################################################
## MODULE VARIABLES ##
################################################################################
################################################################################
data = get_data(
dataset_name='dgs-kpis/fmd-maintenance',
dataframe_name='archibus_maintenance_data')
dframe = clean_data(data)
map_dframe = make_map_dframe(
dframe=dframe, excelfile='data/building_lat_longs.xlsx')
preventative_types = ['PREVENTIVE MAINT','HVAC|PM']
corrective_types = ['BOILER','CHILLERS','COOLING TOWERS','HVAC',
'HVAC INFRASTRUCTURE','HVAC|REPAIR']
hvac_types = ['PREVENTIVE MAINT','HVAC|PM','BOILER','CHILLERS',
'COOLING TOWERS','HVAC','HVAC INFRASTRUCTURE',
'HVAC|REPAIR' ]