-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclassification_preparation.py
190 lines (164 loc) · 7.33 KB
/
classification_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# standard library dependencies
from collections import Counter
from typing import Mapping, List
# external dependencies
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
# local dependencies
from load_all_datasets import compile_datasets
from compute_deltas_df import get_deltas_df
from featureengineering import add_engineered_features
from find_best_features import find_best_features
def load_dataset_with_engineered_features(path_to_data_folder:str,
must_have_substring:str="cleaned_",
verbose:bool=False) -> Mapping[str,pd.DataFrame]:
"""
"""
name_to_df_dict = compile_datasets(path_to_data_folder,
must_have_substring=must_have_substring,
verbose=verbose
)
if verbose: print(f"loaded {len(name_to_df_dict)} datasets; adding engineered features")
for dataset_name, dataset_dataframe in name_to_df_dict.items():
name_to_df_dict[dataset_name] = add_engineered_features(dataset_dataframe)
if verbose: print("added engineered features to all datasets")
return name_to_df_dict
def return_data_to_predict_at_month(name_to_df_dict:Mapping[str,pd.DataFrame],
prediction_month:str,
starting_month:str="cleaned_01-2018.txt",
specific_months:List[str]=None,
verbose:bool=False):
"""
"""
assert prediction_month in name_to_df_dict.keys()
chrono_ordered_datasets = [
"cleaned_01-2018.txt",
"cleaned_02-2018.txt",
"cleaned_03-2018.txt",
"cleaned_04-2018.txt",
"cleaned_05-2018.txt",
"cleaned_06-2018.txt",
"cleaned_07-2018.txt",
"cleaned_08-2018.txt",
"cleaned_09-2018.txt",
"cleaned_10-2018.txt",
"cleaned_11-2018.txt",
"cleaned_12-2018.txt",
"cleaned_01-2019.txt",
"cleaned_02-2019.txt",
"cleaned_2019-03.txt",
"cleaned_2019-04.txt",
"cleaned_2019-05.txt",
"cleaned_2019-06.txt",
"cleaned_2019-07.txt",
"cleaned_2019-08.txt"
]
index_of_starting_month = chrono_ordered_datasets.index(starting_month)
index_of_prediction_month = chrono_ordered_datasets.index(prediction_month)
months_to_consider_in_delta = chrono_ordered_datasets[index_of_starting_month:index_of_prediction_month]
delta_df = get_deltas_df(
months_to_consider_in_delta,
name_to_df_dict
)
all_useable_patients = delta_df.index.tolist()
if verbose: print(f"{len(all_useable_patients)} useable patients")
# keeping only the patients for which we have delta-type data
data_at_month_to_predict = name_to_df_dict[prediction_month].loc[all_useable_patients]
all_data = pd.concat([ delta_df, data_at_month_to_predict ], axis=1)
return all_data
def prepare_to_predict_feature(feature_name:str,
all_useable_data:pd.DataFrame,
min_frequency:int=3,
verbose:bool=False):
"""
"""
assert feature_name in all_useable_data.keys()
y = all_useable_data.loc[:,feature_name]
label_distribution = Counter(y.values)
label_to_frequency_tuples = label_distribution.most_common()
for (label, frequency) in label_to_frequency_tuples[::-1]:
if frequency < min_frequency:
if verbose:
explanation = (
f"class {label} has too few instances to be split into training/validation/testing sets"
f"\nreplacing {label} labels with {label_to_frequency_tuples[0][0]} (the most common label)"
)
print(explanation)
y.iloc[ np.where( y == label ) ] = label_to_frequency_tuples[0][0]
features_df = all_useable_data.drop(columns=[feature_name])
assert feature_name not in features_df.columns
return features_df, y
def split_into_training_validating_testing( X:pd.DataFrame,
y:pd.DataFrame,
train_fraction:float=0.85,
test_fraction:float=0.15,
validation_fraction:float=0.2,
use_strat_splits:bool=False,
nsplits:int=10,
random_state:int=42,
verbose:bool=False):
"""
"""
assert train_fraction + test_fraction == 1.0
_, X_test_indices, _, y_test_indices = train_test_split(
np.arange(X.shape[0]),
np.arange(y.shape[0]),
train_size=train_fraction,
test_size=test_fraction,
random_state=random_state,
stratify=y.values
)
X_test_data = X.iloc[X_test_indices,:]
y_test_data = y.iloc[y_test_indices]
X_train_val_data = X.drop(X.index[X_test_indices])
y_train_val_data = y.drop(y.index[y_test_indices])
if use_strat_splits:
sss = StratifiedShuffleSplit(
n_splits=nsplits,
test_size=(train_fraction - (train_fraction*validation_fraction)),
random_state=random_state
)
all_split_dicts = []
for train_index, validation_index in sss.split(X_train_val_data, y_train_val_data):
X_train, X_val = X.values[train_index,:], X.values[validation_index,:]
y_train, y_val = y.values[train_index], y.values[validation_index]
# print(Counter(y_train), '\n', Counter(y_test))
all_split_dicts.append( {
"X train": X_train,
"X validate": X_val,
"X test": X_test_data,
"y train": y_train,
"y validate": y_val,
"y test": y_test_data
} )
return all_split_dicts
else:
X_train_indices, X_validation_indices, y_train_indices, y_validation_indices = train_test_split(
np.arange(X_train_val_data.shape[0]),
np.arange(y_train_val_data.shape[0]),
test_size=0.2,
random_state=42,
stratify=y_train_val_data
)
X_train = X_train_val_data.iloc[X_train_indices,:]
X_val = X_train_val_data.iloc[X_validation_indices,:]
y_train = y_train_val_data.iloc[y_train_indices]
y_val = y_train_val_data.iloc[y_validation_indices]
if verbose:
print(f"dimensions of testing dataset: {X_test_data.shape}, {y_test_data.shape}")
print(f"dimensions of validating dataset: {X_val.shape}, {y_val.shape}")
print(f"dimensions of training dataset: {X_train.shape}, {y_train.shape}")
print(f"dimensions of original dataset: {X.shape}")
assert X_train.shape[0] + X_val.shape[0] + X_test_data.shape[0] == X.shape[0]
assert y_train.shape[0] + y_val.shape[0] + y_test_data.shape[0] == y.shape[0]
return [{
"X train": X_train,
"X validate": X_val,
"X test": X_test_data,
"y train": y_train,
"y validate": y_val,
"y test": y_test_data
}]