Commit bde04da

test
1 parent 6212ac2 commit bde04da

1 file changed: +280 -0 lines changed

test.py

@@ -0,0 +1,280 @@
time_range = 15

import os
# import netCDF4
import numpy as np
import matplotlib.pyplot as plt
import datetime
import time
import calendar
from helpers.time_ranges import *
from second_order import *
import math
import lightgbm as lg
from sklearn.model_selection import *
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, mean_poisson_deviance, mean_gamma_deviance
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, confusion_matrix, recall_score, f1_score, auc, matthews_corrcoef

from sklearn.metrics import make_scorer

from joblib import Parallel, delayed
from sklearn.tree import *

from helpers.fourierExtrapolation import *
# from helpers.helpers import *
from sklearn.ensemble import *
from sklearn.linear_model import *
import json
import sys
from sklearn.preprocessing import StandardScaler

# Index of the target column in each CSV row, taken from the command line.
index = int(sys.argv[1])
# from spark_sklearn import GridSearchCV

def get_filepaths(directory):
    file_paths = []  # List which will store all of the full filepaths.
    # Walk the tree.
    for root, directories, files in os.walk(directory):
        for filename in files:
            # Join the two strings in order to form the full filepath.
            filepath = os.path.join(root, filename)
            file_paths.append(filepath)  # Add it to the list.
    return file_paths

def fit_classifier(svc_rbf, param, metric, X_train, y_train, X_test, y_test):
    clas = svc_rbf.set_params(**param)
    clas.fit(X_train, y_train)
    # return param, roc_auc_score(y_test, [a[1] for a in clas.predict_proba(X_test)])
    return param, metric(y_test, clas.predict(X_test))

def gridsearch(X, y, svc_rbf, skf, metric=roc_auc_score, param_space={}, n_jobs=-1):
    import json
    my_hash = {}
    i = 0
    # Fit every parameter combination on every CV fold in parallel.
    # for train, test in skf.split(X, y):
    param_score = []
    # with parallel_backend('spark'):
    param_score = Parallel(n_jobs=n_jobs)(
        delayed(fit_classifier)(svc_rbf, param, metric, X[train], y[train], X[test], y[test])
        for param in ParameterGrid(param_space) for train, test in skf.split(X, y))
    # best_param, best_score = max(param_score, key=lambda x: x[1])
    # print('best param this generation is {} with score {}.'.format(best_param, best_score))
    # Sum the fold scores for each parameter setting (keyed by its JSON dump).
    for my_val in param_score:
        key = json.dumps(my_val[0])
        if key not in my_hash.keys() or my_hash[key] is None:
            my_hash[key] = my_val[1]
        else:
            my_hash[key] = my_hash[key] + my_val[1]
    # Average the summed scores over the number of folds.
    i = skf.get_n_splits()
    param_scores = []
    for k, v in my_hash.items():
        param_scores.append((k, float(v / i)))
    best_param, best_score = max(param_scores, key=lambda x: x[1])
    print('Best scoring param is {} with score {}.'.format(best_param, best_score))
    return best_param

import glob

def get_test(clf, X, y, X_test):
    return [a for a in clf.fit(X, y).predict(X_test)]

def scatter_index(s, o):
    # Scatter index: RMS of the anomaly differences normalised by the observations
    # (reconstructed; the original used list subtraction and '^' instead of '**').
    s_m = [a - np.mean(s) for a in s]
    o_m = [a - np.mean(o) for a in o]
    return math.sqrt(np.sum([(s_m[a] - o_m[a]) ** 2 for a in range(len(o))]) / np.sum([a ** 2 for a in o]))

full_file_paths = get_filepaths('./dataset/')

import dask
# dask.config.set(scheduler='processes')
from joblib import parallel_backend
from distributed import Client, progress, LocalCluster


def my_custom_loss_func(ground_truth, predictions):
    # diff = np.abs(ground_truth - predictions).max()
    # return np.log(1 + diff)
    return np.corrcoef(ground_truth, predictions)[0, 1]

my_scorer = make_scorer(my_custom_loss_func, greater_is_better=True)

def run():
    global time_range
    # from helpers import helpers
    # from spark_sklearn import GridSearchCV
    for day in range(13, time_range):
        days = day
        print("------")
        print("day: " + str(days))
        X = []
        y = []
        trends = []
        for fil in full_file_paths:
            # X, y = [], []
            X_temp = []
            y_temp = []
            count = 0
            with open(fil, "r") as f:
                lines = f.readlines()[1:]
                for line in lines:
                    val = line.strip().split(",")
                    x = [float(val[a]) for a in range(len(val) - 1)]  # the last column is the date
                    # Split the date/time string into numeric features.
                    x.extend([float(a) for a in val[-1].split(" ")[0].split("/")])
                    x.extend([float(str(a)[0] + "." + str(a)[1]) for a in val[-1].split(" ")[1].split(":")])
                    y_ = [float(val[a]) for a in range(len(val)) if a == index]
                    # if np.nan not in x and np.nan not in y:
                    X_temp.append(x)
                    y_temp.append(y_[0])

            x_val = X_temp
            y_val = y_temp
            # x_val = X_temp[:len(X_temp) - int(days*24*2)]
            # y_val = y_temp[int(days*24*2):]
            trend = [0 for a in y_val]
            if len(y_val) == 0:  # file with no elements
                continue
            x_val, y_val, trend = difference(x_val, y_val, int(days * 24 * 2))
            # Drop rows with NaNs; the trend is y[i+1] - y[i], so the last value is gone.
            for a in range(0, len(x_val)):
                if not np.isnan(x_val[a]).any() and not np.isnan(y_val[a]):  # and y_val[a] < 20:
                    X.append(x_val[a])
                    y.append(y_val[a])
                    trends.append(trend[a])
            # X.extend(X_temp[:len(X_temp) - days*24*2])
            # y.extend([a[0] for a in y_temp[days*24*2:]])

        # X, y = X[:50000], y[:50000]
        print(len(X))
        # print(trends[:10])
        trends = np.array(trends)
        indices = [a for a in range(len(y))]
        print("mean: " + str(np.mean(y)) + " std: " + str(np.std(y)) + " range: " + str([min(y), max(y)]))
        X_train, X_test, y_train, y_test, idx1, idx2 = train_test_split(X, y, indices, test_size=0.3, random_state=27)

        X_train, y_train, X_test, y_test = np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
        tr_trend = np.array(trends[idx1])
        te_trend = np.array(trends[idx2])
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        print("train: " + str(len(X_train)) + ", test: " + str(len(y_test)))
        grid1 = lg.LGBMRegressor(n_estimators=100, max_depth=None, random_state=100, n_jobs=-1)
        grid2 = ExtraTreesRegressor(random_state=100, max_depth=None, n_jobs=1)
        grid3 = DecisionTreeRegressor()
        grids = [('lgb', grid1), ('et', grid2), ]  # ('dt', grid3)
        param2 = {'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
                  'min_samples_split': [2, 5, 10, 15, 25, 35, 45, 55, 65],
                  'min_samples_leaf': [100, 150, 200, 250, 300, 350, 400, 450, 500],
                  }
        param1 = {'n_estimators': [100, 200, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200],
                  'num_leaves': [100, 150, 200, 250, 300, 350, 400, 450, 500, 750, 1000],
                  'min_child_samples': [5, 10, 20, 30, 50, 75, 100, 150, 200],
                  }
        param3 = {'n_estimators': [100],
                  'num_leaves': [5, 10, 15, 20, 30, 40, 50, 60, 70],
                  'min_child_samples': [5, 15, 25, 30, 35, 40],
                  }

        c_range = np.linspace(-5, 15, num=25)
        C_range = [math.pow(2, i) for i in c_range]
        # param3 = {}  # {'alpha': C_range, }

        params = [param1, param2, param3]
        for gr in range(len(grids)):
            print("classifier: " + grids[gr][0])
            grid = grids[gr][1]
            # my_f = BlockingTimeSeriesSplit(n_splits=3)
            my_f = KFold(n_splits=3)
            X, y = np.array(X_train), np.array(y_train)
            y_ori = y.copy()
            param = {}
            # with parallel_backend('spark'):
            # param = gridsearch(X, y, grid, my_f, metric=r2_score, param_space=params[gr], n_jobs=-1)
            # param = json.loads(param)
            grid = grid.set_params(**param)
            grid = GridSearchCV(grid, params[gr], scoring='neg_mean_squared_error', n_jobs=-1, cv=3)
            # with parallel_backend('spark'):
            grid.fit(X, y)
            if grids[gr][0] != 'linear regression':
                param = {'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900], }  # 'n_jobs': [-1]
                print("best score: " + str(grid.best_score_) + " best params: " + str(grid.best_params_))
                grid = grid.best_estimator_
                # if grids[gr][0] != 'et':
                #     param = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500, ], }
                #     grid = GridSearchCV(grid, param, scoring='r2', n_jobs=1, cv=3)
                # else:
                #     param = {'n_estimators': 500, 'n_jobs': -1}
                # Note: the original called grid.set_params(**param) here, but `param`
                # holds lists of candidate values, which the estimators would reject at
                # fit time; the best estimator above already carries the tuned settings.
                # grid.set_params(**param)
            # else:
            #     print("best score: " + str(grid.best_score_) + " best params: " + str(grid.best_params_))
            # Out-of-fold predictions on the training set, one fold at a time.
            predicted_1 = Parallel(n_jobs=-1)(delayed(get_test)(grid, X_train[train], y_train[train], X_train[test])
                                              for train, test in my_f.split(X_train, y_train))
            predicted_1 = [item for sublist in predicted_1 for item in sublist]

            # Re-add the trend removed by differencing, so the metrics are on the original scale.
            y_selected = []
            y_ori = [y_train[a] + tr_trend[a] for a in range(len(y_train))]
            y_ori = np.array(y_ori)
            predict_ = []

            for tr, te in my_f.split(X_train, y_train):
                y_tes = y_ori[te]
                y_selected.extend(y_tes)
                predict_.extend([predicted_1[a] + tr_trend[a] for a in te])
            y_ = y_selected
            predicted_1 = predict_

            # predicted_1 = cross_val_predict(grid, X, y, cv=5, n_jobs=-1)
            var = explained_variance_score(y_, predicted_1)
            abs_err = mean_absolute_error(y_, predicted_1)
            sq_err = mean_squared_error(y_, predicted_1)
            # r2 = r2_score(y, predicted_1)
            r2 = 1 - (1 - r2_score(y_, predicted_1)) * ((len(X_train) - 1) / (len(X_train) - len(X_train[0]) - 1))
            mean_y, mean_pred = np.mean(y_), np.mean(predicted_1)
            # si = scatter_index(predicted_1, y)  # observation put on end
            si = math.sqrt(np.mean([(-y_[a] + mean_y - mean_pred + predicted_1[a]) ** 2 for a in range(len(y_))])) / np.mean(y_)
            nse = 1 - sum([(predicted_1[a] - y_[a]) ** 2 for a in range(len(y_))]) / sum([(y_[a] - mean_y) ** 2 for a in range(len(y_))])  # was y[a]; compare against the re-trended targets
            bias = np.mean([y_[a] - predicted_1[a] for a in range(len(y_))])
            hh = math.sqrt(sum([(y_[a] - predicted_1[a]) ** 2 for a in range(len(y_))]) / sum([y_[a] * predicted_1[a] for a in range(len(y_))]))
            print("training")
            # print("mean: " + str(np.mean(y)) + " std: " + str(np.std(y)) + " range: " + str([min(y), max(y)]))
            print("sq_err: " + str(math.sqrt(sq_err)) + " abs_err: " + str(abs_err) + ' var: ' +
                  str(var) + ' r2: ' + str(r2) + " si: " + str(si) + ' nse: ' + str(nse) +
                  ' cc: ' + str(np.corrcoef(y_, predicted_1)[0, 1]) + ' bias: ' + str(bias) +
                  ' hh: ' + str(hh))

            print("test")
            param = {'n_jobs': -1}
            grid.set_params(**param)
            # with parallel_backend('spark'):
            grid.fit(X_train, y_train)
            predicted_1 = grid.predict(X_test)
            # Re-add the test-set trend before scoring.
            y_t = [y_test[a] + te_trend[a] for a in range(len(y_test))]
            predicted_1 = [predicted_1[a] + te_trend[a] for a in range(len(predicted_1))]
            var = explained_variance_score(y_t, predicted_1)
            abs_err = mean_absolute_error(y_t, predicted_1)
            sq_err = mean_squared_error(y_t, predicted_1)
            # r2_score = r2_score(y_, predicted_1)
            r2 = 1 - (1 - r2_score(y_t, predicted_1)) * ((len(X_test) - 1) / (len(X_test) - len(X_test[0]) - 1))
            mean_y, mean_pred = np.mean(y_t), np.mean(predicted_1)
            # si = math.sqrt(sq_err) / np.mean(predicted_1)
            # si = math.sqrt(sum([(-y_[a] + mean_y - mean_pred + predicted_1[a])**2]) / sum([b**2 for b in y_]))
            si = math.sqrt(np.mean([(-y_t[a] + mean_y - mean_pred + predicted_1[a]) ** 2 for a in range(len(y_t))])) / np.mean(y_t)
            nse = 1 - sum([(predicted_1[a] - y_t[a]) ** 2 for a in range(len(y_t))]) / sum([(y_t[a] - mean_y) ** 2 for a in range(len(y_t))])  # was y[a]; use the test targets
            bias = np.mean([y_t[a] - predicted_1[a] for a in range(len(y_t))])
            hh = math.sqrt(sum([(y_t[a] - predicted_1[a]) ** 2 for a in range(len(y_t))]) / sum([y_t[a] * predicted_1[a] for a in range(len(y_t))]))  # was y_[a]; use the test targets
            print("sq_err: " + str(math.sqrt(sq_err)) + " abs_err: " + str(abs_err) + ' var: ' +
                  str(var) + ' r2: ' + str(r2) + " si: " + str(si) + ' nse: ' + str(nse) +
                  ' cc: ' + str(np.corrcoef(y_t, predicted_1)[0, 1]) + ' bias: ' + str(bias) +
                  ' hh: ' + str(hh))

if __name__ == "__main__":
    # with parallel_backend('spark'):
    run()
