-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmodeling_prep.py
More file actions
80 lines (62 loc) · 2.69 KB
/
modeling_prep.py
File metadata and controls
80 lines (62 loc) · 2.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
def county_grouped_shufflesplit(
    df: pd.DataFrame,
    train_size: float = 0.8,
    random_state=None,
) -> dict:
    r"""
    Split the data into train/test sets so that no county is in both sets.

    Rows whose 'score' is missing are discarded, then a single
    GroupShuffleSplit is drawn using the 'fips' column as the group label.
    The DataFrame passed in is not modified.

    Parameters
    ----------
    'df' : pd.DataFrame
        contains the data to be split. Must have a 'score' column (target)
        and a 'fips' column (county label). Expected to be an import of a
        csv from .\processed_data, with date as index.
    'train_size' : float
        proportion of the groups (counties) assigned to the train split.
        Must be strictly between 0 and 1.
    'random_state' : int, RandomState instance or None
        forwarded to GroupShuffleSplit; pass an int for a reproducible split.

    Returns
    -------
    'split_data_dict' a dict with keys 'X_train', 'X_test', 'y_train',
    'y_test', 'county_train' and 'county_test'. Every value is a
    DataFrame; X holds all columns except 'score' and 'fips'.

    Raises
    ------
    AssertionError
        if 'train_size' is not strictly between 0 and 1.
    """
    # Raised explicitly (rather than via `assert`) so the checks still run
    # under `python -O`; AssertionError is kept for existing callers.
    if not train_size < 1:
        raise AssertionError('Train size needs to be less than 1.')
    if not train_size > 0:
        raise AssertionError('Train size needs to be greater than 0.')

    # Non-inplace dropna: work on a filtered copy, leave the caller's df alone.
    labelled = df.dropna(subset=['score'], how='all')

    columns = labelled.columns
    X = labelled.loc[:, (columns != 'score') & (columns != 'fips')]
    y = labelled.loc[:, columns == 'score']
    county = labelled.loc[:, columns == 'fips']

    # Honour the requested train_size (previously a fixed test_size=0.2 was
    # used regardless of the argument).
    splitter = GroupShuffleSplit(
        n_splits=1, train_size=train_size, random_state=random_state
    )
    # n_splits=1, so the generator yields exactly one (train, test) pair.
    # Groups are passed as a 1-D label vector.
    train_idx, test_idx = next(
        splitter.split(X=X, y=y, groups=labelled['fips'])
    )

    return {
        'X_train': X.iloc[train_idx, :],
        'X_test': X.iloc[test_idx, :],
        'y_train': y.iloc[train_idx, :],
        'y_test': y.iloc[test_idx, :],
        'county_train': county.iloc[train_idx, :],
        'county_test': county.iloc[test_idx, :],
    }
def train_test_split_default(
    df: pd.DataFrame,
    train_size: float = 0.8,
    random_state=None,
) -> dict:
    r"""
    Perform a plain (ungrouped) shuffled train/test split of the data.

    The 'fips' column is removed, rows whose 'score' is missing are
    discarded, and the remainder is split with sklearn's train_test_split.
    The DataFrame passed in is not modified.

    Parameters
    ----------
    'df' : pd.DataFrame
        contains the data to be split. Must have a 'score' column (target)
        and a 'fips' column. Expected to be an import of a csv from
        .\processed_data, with date as index.
    'train_size' : float
        proportion of the rows assigned to the train split. Must be
        strictly between 0 and 1.
    'random_state' : int, RandomState instance or None
        forwarded to train_test_split; pass an int for a reproducible split.

    Returns
    -------
    'split_data_dict' a dict with keys 'X_train', 'X_test', 'y_train' and
    'y_test'. Every value is a DataFrame; X holds all columns except
    'score' and 'fips'.

    Raises
    ------
    AssertionError
        if 'train_size' is not strictly between 0 and 1.
    KeyError
        if 'df' has no 'fips' or no 'score' column.
    """
    # Raised explicitly (rather than via `assert`) so the checks still run
    # under `python -O`; AssertionError is kept for existing callers.
    if not train_size < 1:
        raise AssertionError('Train size needs to be less than 1.')
    if not train_size > 0:
        raise AssertionError('Train size needs to be greater than 0.')

    # Non-inplace drop/dropna: the caller's df keeps its 'fips' column and
    # all of its rows, so it can be reused for further splits.
    labelled = df.drop(columns=['fips']).dropna(subset=['score'], how='all')

    is_target = labelled.columns == 'score'
    X = labelled.loc[:, ~is_target]
    y = labelled.loc[:, is_target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, random_state=random_state
    )

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }