time_range = 15

import os
#import netCDF4
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time
import calendar
from helpers.time_ranges import *
from second_order import *
import math
import lightgbm as lg
from sklearn.model_selection import *
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, mean_poisson_deviance, mean_gamma_deviance
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, confusion_matrix, recall_score, f1_score, auc, matthews_corrcoef
from sklearn.metrics import make_scorer
from sklearn.tree import *
from sklearn.ensemble import *
from sklearn.linear_model import *
from sklearn.preprocessing import StandardScaler
from joblib import Parallel, delayed
from helpers.fourierExtrapolation import *
#from helpers.helpers import *
#from spark_sklearn import GridSearchCV
import json
import sys

# Target-column index in the input CSV files, taken from the command line.
index = int(sys.argv[1])


def get_filepaths(directory):
    """Walk `directory` and return the full path of every file under it."""
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            file_paths.append(os.path.join(root, filename))
    return file_paths

def fit_classifier(svc_rbf, param, metric, X_train, y_train, X_test, y_test):
    clas = svc_rbf.set_params(**param)
    clas.fit(X_train, y_train)
    #return param, roc_auc_score(y_test, [a[1] for a in clas.predict_proba(X_test)])
    return param, metric(y_test, clas.predict(X_test))
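
# gridsearch hand-rolls a parallel grid search: every (parameter set, CV fold)
# pair is fitted independently via joblib, and the per-fold scores are summed
# and then averaged per parameter set, keyed by the JSON dump of each dict.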
def gridsearch(X, y, svc_rbf, skf, metric=roc_auc_score, param_space={}, n_jobs=-1):
    param_score = Parallel(n_jobs=n_jobs)(delayed(fit_classifier)(svc_rbf, param, metric, X[train], y[train], X[test], y[test]) for param in ParameterGrid(param_space) for train, test in skf.split(X, y))
    # Sum the fold scores per parameter set.
    my_hash = {}
    for param, score in param_score:
        key = json.dumps(param)
        my_hash[key] = my_hash.get(key, 0.0) + score
    # Average over the folds and pick the best-scoring parameter set.
    n_splits = skf.get_n_splits()
    param_scores = [(k, v / n_splits) for k, v in my_hash.items()]
    best_param, best_score = max(param_scores, key=lambda x: x[1])
    print('Best scoring param is {} with score {}.'.format(best_param, best_score))
    return best_param
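
# Illustrative use of gridsearch (the estimator and grid here are made-up
# examples, not the ones tuned below):
#   skf = KFold(n_splits=3)
#   best = gridsearch(X, y, lg.LGBMRegressor(), skf, metric=r2_score,
#                     param_space={'n_estimators': [100, 200]}, n_jobs=-1)
#   model = lg.LGBMRegressor(**json.loads(best))  # best is a JSON string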

def get_test(clf, X, y, X_test):
    return [a for a in clf.fit(X, y).predict(X_test)]


def scatter_index(s, o):
    # Scatter index: root of the summed squared mean-removed differences between
    # simulated s and observed o, normalised by the summed squared observations.
    s_m = [a - np.mean(s) for a in s]
    o_m = [a - np.mean(o) for a in o]
    return math.sqrt(np.sum([(s_m[i] - o_m[i]) ** 2 for i in range(len(s_m))]) / np.sum([a ** 2 for a in o]))

full_file_paths = get_filepaths('./dataset/')

import dask
#dask.config.set(scheduler='processes')
from joblib import parallel_backend
from distributed import Client, progress, LocalCluster


def my_custom_loss_func(ground_truth, predictions):
    # Score a fit by the Pearson correlation between truth and predictions.
    return np.corrcoef(ground_truth, predictions)[0, 1]

my_scorer = make_scorer(my_custom_loss_func, greater_is_better=True)
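
# my_scorer can be handed to sklearn searches via the scoring argument, e.g.
# (illustrative) GridSearchCV(est, grid, scoring=my_scorer, cv=3); the search
# below currently uses 'neg_mean_squared_error' instead.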

def run():
    global time_range
    for day in range(13, time_range):
        days = day
        print("------")
        print("day: " + str(days))
        X = []
        y = []
        trends = []
        for fil in full_file_paths:
            X_temp = []
            y_temp = []
            with open(fil, "r") as f:
                lines = f.readlines()[1:]  # skip the header row
                for line in lines:
                    val = line.strip().split(",")
                    # Every column except the last (the timestamp) is a feature.
                    x = [float(val[a]) for a in range(len(val) - 1)]
                    # Append the date part of the timestamp as three numbers ...
                    x.extend([float(a) for a in val[-1].split(" ")[0].split("/")])
                    # ... and each two-digit time component "dd" as the float d.d.
                    x.extend([float(str(a)[0] + "." + str(a)[1]) for a in val[-1].split(" ")[1].split(":")])
                    # The target is the column selected on the command line.
                    y_ = [float(val[a]) for a in range(len(val)) if a == index]
                    X_temp.append(x)
                    y_temp.append(y_[0])

            x_val = X_temp
            y_val = y_temp
            trend = [0 for a in y_val]
            if len(y_val) == 0:  # skip files with no data rows
                continue
            # Difference the series at a lag of days*48 half-hourly steps;
            # difference() also returns the removed trend so it can be added
            # back after prediction.
            x_val, y_val, trend = difference(x_val, y_val, int(days * 24 * 2))
            for a in range(0, len(x_val)):
                # The trend is y[i+1] - y[i]; keep only rows without NaNs.
                if not np.isnan(x_val[a]).any() and not np.isnan(y_val[a]):
                    X.append(x_val[a])
                    y.append(y_val[a])
                    trends.append(trend[a])

        print(len(X))
        trends = np.array(trends)
        indices = [a for a in range(len(y))]
        print("mean: " + str(np.mean(y)) + " std: " + str(np.std(y)) + " range: " + str([min(y), max(y)]))
        # Split features, targets and row indices together so the trends can be
        # realigned with the shuffled train/test rows.
        X_train, X_test, y_train, y_test, idx1, idx2 = train_test_split(X, y, indices, test_size=0.3, random_state=27)

        X_train, y_train, X_test, y_test = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
        tr_trend = np.array(trends[idx1])
        te_trend = np.array(trends[idx2])
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("train: " + str(len(X_train)) + ", test: " + str(len(y_test)))
        grid1 = lg.LGBMRegressor(n_estimators=100, max_depth=None, random_state=100, n_jobs=-1)
        grid2 = ExtraTreesRegressor(random_state=100, max_depth=None, n_jobs=1)
        grid3 = DecisionTreeRegressor()
        grids = [('lgb', grid1), ('et', grid2), ]  # ('dt', grid3)
        param2 = {'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                  'min_samples_split': [2, 5, 10, 15, 25, 35, 45, 55, 65],
                  'min_samples_leaf': [100, 150, 200, 250, 300, 350, 400, 450, 500],
                  }
        param1 = {'n_estimators': [100, 200, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200],
                  'num_leaves': [100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000, ],
                  'min_child_samples': [5, 10, 20, 30, 50, 75, 100, 150, 200, ],
                  }
        # Only used if the third estimator is re-enabled in grids above.
        param3 = {'n_estimators': [100, ],
                  'num_leaves': [5, 10, 15, 20, 30, 40, 50, 60, 70],
                  'min_child_samples': [5, 15, 25, 30, 35, 40, ],
                  }

        # Log-spaced regularisation grid, leftover from a linear-model search.
        c_range = np.linspace(-5, 15, num=25)
        C_range = [math.pow(2, i) for i in c_range]
        #param3 = {'alpha': C_range, }

        params = [param1, param2, param3]
        for gr in range(len(grids)):
            print("classifier: " + grids[gr][0])
            grid = grids[gr][1]
            #my_f = BlockingTimeSeriesSplit(n_splits=3)
            my_f = KFold(n_splits=3)
            X, y = np.array(X_train), np.array(y_train)
            param = {}
            # Alternative: the hand-rolled parallel search defined above.
            #param = json.loads(gridsearch(X, y, grid, my_f, metric=r2_score, param_space=params[gr], n_jobs=-1))
            grid = grid.set_params(**param)
            grid = GridSearchCV(grid, params[gr], scoring='neg_mean_squared_error', n_jobs=-1, cv=3)
            grid.fit(X, y)
            if grids[gr][0] != 'linear regression':
                print("best score: " + str(grid.best_score_) + " best params: " + str(grid.best_params_))
                grid = grid.best_estimator_
                # Refit the winning model with a larger tree budget; set_params
                # takes a single value (500 follows the commented-out intent below).
                param = {'n_estimators': 500}
                #param = {'n_estimators': 500, 'n_jobs': -1}
                grid.set_params(**param)
            # Out-of-fold predictions on the training data, parallel over folds.
            predicted_1 = Parallel(n_jobs=-1)(delayed(get_test)(grid, X_train[train], y_train[train], X_train[test]) for train, test in my_f.split(X_train, y_train))
            predicted_1 = [item for sublist in predicted_1 for item in sublist]

            # Add the removed trend back before scoring. Indexing predicted_1 by
            # the fold's test indices works because an unshuffled KFold yields
            # contiguous, in-order test folds.
            y_selected = []
            y_ori = [y_train[a] + tr_trend[a] for a in range(len(y_train))]
            y_ori = np.array(y_ori)
            predict_ = []
            for tr, te in my_f.split(X_train, y_train):
                y_tes = y_ori[te]
                y_selected.extend(y_tes)
                predict_.extend([predicted_1[a] + tr_trend[a] for a in te])
            y_ = y_selected
            predicted_1 = predict_

            var = explained_variance_score(y_, predicted_1)
            abs_err = mean_absolute_error(y_, predicted_1)
            sq_err = mean_squared_error(y_, predicted_1)
            # Adjusted R^2, penalising for the number of features.
            r2 = 1 - (1 - r2_score(y_, predicted_1)) * ((len(X_train) - 1) / (len(X_train) - len(X_train[0]) - 1))
            mean_y, mean_pred = np.mean(y_), np.mean(predicted_1)
            # Scatter index: RMSE of the mean-removed errors over the mean observation.
            si = math.sqrt(np.mean([(-y_[a] + mean_y - mean_pred + predicted_1[a]) ** 2 for a in range(len(y_))])) / np.mean(y_)
            # Nash-Sutcliffe efficiency.
            nse = 1 - sum([(predicted_1[a] - y_[a]) ** 2 for a in range(len(y_))]) / sum([(y_[a] - mean_y) ** 2 for a in range(len(y_))])
            bias = np.mean([y_[a] - predicted_1[a] for a in range(len(y_))])
            # HH indicator: RMSE normalised by the cross product of truth and prediction.
            hh = math.sqrt(sum([(y_[a] - predicted_1[a]) ** 2 for a in range(len(y_))]) / sum([y_[a] * predicted_1[a] for a in range(len(y_))]))
            print("training")
            print("sq_err: " + str(math.sqrt(sq_err)) + " abs_err: " + str(abs_err) + ' var: ' +
                  str(var) + ' r2: ' + str(r2) + " si: " + str(si) + ' nse: ' + str(nse) +
                  ' cc: ' + str(np.corrcoef(y_, predicted_1)[0, 1]) + ' bias: ' + str(bias) +
                  ' hh: ' + str(hh))

            print("test")
            param = {'n_jobs': -1}
            grid.set_params(**param)
            grid.fit(X_train, y_train)
            predicted_1 = grid.predict(X_test)
            # Re-add the trend to both truth and prediction on the test split.
            y_t = [y_test[a] + te_trend[a] for a in range(len(y_test))]
            predicted_1 = [predicted_1[a] + te_trend[a] for a in range(len(predicted_1))]
            var = explained_variance_score(y_t, predicted_1)
            abs_err = mean_absolute_error(y_t, predicted_1)
            sq_err = mean_squared_error(y_t, predicted_1)
            r2 = 1 - (1 - r2_score(y_t, predicted_1)) * ((len(X_test) - 1) / (len(X_test) - len(X_test[0]) - 1))
            mean_y, mean_pred = np.mean(y_t), np.mean(predicted_1)
            si = math.sqrt(np.mean([(-y_t[a] + mean_y - mean_pred + predicted_1[a]) ** 2 for a in range(len(y_t))])) / np.mean(y_t)
            nse = 1 - sum([(predicted_1[a] - y_t[a]) ** 2 for a in range(len(y_t))]) / sum([(y_t[a] - mean_y) ** 2 for a in range(len(y_t))])
            bias = np.mean([y_t[a] - predicted_1[a] for a in range(len(y_t))])
            hh = math.sqrt(sum([(y_t[a] - predicted_1[a]) ** 2 for a in range(len(y_t))]) / sum([y_t[a] * predicted_1[a] for a in range(len(y_t))]))
            print("sq_err: " + str(math.sqrt(sq_err)) + " abs_err: " + str(abs_err) + ' var: ' +
                  str(var) + ' r2: ' + str(r2) + " si: " + str(si) + ' nse: ' + str(nse) +
                  ' cc: ' + str(np.corrcoef(y_t, predicted_1)[0, 1]) + ' bias: ' + str(bias) +
                  ' hh: ' + str(hh))


if __name__ == "__main__":
    #with parallel_backend('spark'):
    run()
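
# Usage (the script name here is illustrative):
#   python train.py 3
# where the positional argument is the zero-based column index of the target
# variable in the CSV files, consumed by `index = int(sys.argv[1])` above.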