# -*- coding: utf-8 -*-
import os
import pickle
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from torch import nn
import torch.nn.functional as F
import torch
from matplotlib import pyplot as plt
prefix = "processed"


def save_z(z, filename='z'):
    """
    Save the sampled z in txt files (one file per probed index of the
    second dimension, plus the last index).
    """
    for i in range(0, z.shape[1], 20):
        with open(filename + '_' + str(i) + '.txt', 'w') as file:
            for j in range(0, z.shape[0]):
                for k in range(0, z.shape[2]):
                    file.write('%f ' % (z[j][i][k]))
                file.write('\n')
    i = z.shape[1] - 1
    with open(filename + '_' + str(i) + '.txt', 'w') as file:
        for j in range(0, z.shape[0]):
            for k in range(0, z.shape[2]):
                file.write('%f ' % (z[j][i][k]))
            file.write('\n')
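
# A minimal usage sketch for `save_z` (the shape below is an assumption; z is
# expected to be 3-D, indexed as z[sample][time_step][dim]):
#
#   z = np.random.randn(64, 40, 3)
#   save_z(z, filename='z')   # writes z_0.txt, z_20.txt and z_39.txt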


def get_data_dim(dataset):
    if dataset == 'SMAP':
        return 25
    elif dataset == 'MSL':
        return 55
    elif str(dataset).startswith('machine'):
        return 38
    elif dataset == "SWaT":
        return 51
    else:
        raise ValueError('unknown dataset ' + str(dataset))


def get_data(dataset, max_train_size=None, max_test_size=None, print_log=True,
             do_preprocess=True, train_start=0, test_start=0):
    """
    Get data from pkl files.

    Return shape: (([train_size, x_dim], None), ([test_size, x_dim], [test_size] or None))
    """
    if max_train_size is None:
        train_end = None
    else:
        train_end = train_start + max_train_size
    if max_test_size is None:
        test_end = None
    else:
        test_end = test_start + max_test_size
    print('load data of:', dataset)
    print("train: ", train_start, train_end)
    print("test: ", test_start, test_end)
    x_dim = get_data_dim(dataset)
    with open(os.path.join(prefix, dataset + '_train.pkl'), "rb") as f:
        train_data = pickle.load(f).reshape((-1, x_dim))[train_start:train_end, :]
    try:
        with open(os.path.join(prefix, dataset + '_test.pkl'), "rb") as f:
            test_data = pickle.load(f).reshape((-1, x_dim))[test_start:test_end, :]
    except (KeyError, FileNotFoundError):
        test_data = None
    try:
        with open(os.path.join(prefix, dataset + "_test_label.pkl"), "rb") as f:
            test_label = pickle.load(f).reshape((-1))[test_start:test_end]
    except (KeyError, FileNotFoundError):
        test_label = None
    if do_preprocess:
        train_data = preprocess(train_data)
        if test_data is not None:
            test_data = preprocess(test_data)
    print("train set shape: ", train_data.shape)
    if test_data is not None:
        print("test set shape: ", test_data.shape)
    if test_label is not None:
        print("test set label shape: ", test_label.shape)
    return (train_data, None), (test_data, test_label)
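
# Example (assumes the pickled arrays exist under `processed/`, e.g.
# `processed/SMAP_train.pkl`):
#
#   (x_train, _), (x_test, y_test) = get_data('SMAP')
#   # x_train: (train_size, 25) scaled to [0, 1]; y_test: (test_size,) labels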


def preprocess(df):
    """
    Returns the data min-max normalized to [0, 1], feature-wise.
    """
    df = np.asarray(df, dtype=np.float32)
    if len(df.shape) == 1:
        raise ValueError('Data must be a 2-D array')
    if np.any(np.isnan(df)):
        print('Data contains null values. Will be replaced with 0')
        df = np.nan_to_num(df)
    # normalize data
    df = MinMaxScaler().fit_transform(df)
    print('Data normalized')
    return df
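
# Example: each column is scaled independently.
#
#   preprocess(np.array([[0., 10.], [5., 20.], [10., 30.]]))
#   # -> [[0., 0.], [0.5, 0.5], [1., 1.]]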


def minibatch_slices_iterator(length, batch_size,
                              ignore_incomplete_batch=False):
    """
    Iterate through all the mini-batch slices.

    Args:
        length (int): Total length of data in an epoch.
        batch_size (int): Size of each mini-batch.
        ignore_incomplete_batch (bool): If :obj:`True`, discard the final
            batch if it contains less than `batch_size` number of items.
            (default :obj:`False`)

    Yields:
        slice: Slices of each mini-batch. The last mini-batch may contain
            fewer indices than `batch_size`.
    """
    start = 0
    stop1 = (length // batch_size) * batch_size
    while start < stop1:
        yield slice(start, start + batch_size, 1)
        start += batch_size
    if not ignore_incomplete_batch and start < length:
        yield slice(start, length, 1)
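
# Example: with length=10 and batch_size=4 this yields slice(0, 4, 1),
# slice(4, 8, 1) and, unless `ignore_incomplete_batch` is True, slice(8, 10, 1).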


class BatchSlidingWindow(object):
    """
    Class for obtaining mini-batch iterators of sliding windows.

    Each mini-batch will have `batch_size` windows. If the final batch
    contains less than `batch_size` windows, it will be discarded if
    `ignore_incomplete_batch` is :obj:`True`.

    Args:
        array_size (int): Size of the arrays to be iterated.
        window_size (int): The size of the windows.
        batch_size (int): Size of each mini-batch.
        excludes (np.ndarray): 1-D `bool` array, indicators of whether
            or not to totally exclude a point. If a point is excluded,
            any window which contains that point is excluded.
            (default :obj:`None`, no point is totally excluded)
        shuffle (bool): If :obj:`True`, the windows will be iterated in
            shuffled order. (default :obj:`False`)
        ignore_incomplete_batch (bool): If :obj:`True`, discard the final
            batch if it contains less than `batch_size` number of windows.
            (default :obj:`False`)
    """

    def __init__(self, array_size, window_size, batch_size, excludes=None,
                 shuffle=False, ignore_incomplete_batch=False):
        # check the parameters
        if window_size < 1:
            raise ValueError('`window_size` must be at least 1')
        if array_size < window_size:
            raise ValueError('`array_size` must be at least as large as '
                             '`window_size`')
        if excludes is not None:
            excludes = np.asarray(excludes, dtype=bool)
            expected_shape = (array_size,)
            if excludes.shape != expected_shape:
                raise ValueError('The shape of `excludes` is expected to be '
                                 '{}, but got {}'.format(expected_shape,
                                                         excludes.shape))
        # compute which points are not excluded
        if excludes is not None:
            mask = np.logical_not(excludes)
        else:
            mask = np.ones([array_size], dtype=bool)
        mask[: window_size - 1] = False
        if excludes is not None:
            # a window is excluded if it contains any excluded point
            where_excludes = np.where(excludes)[0]
            for k in range(1, window_size):
                also_excludes = where_excludes + k
                also_excludes = also_excludes[also_excludes < array_size]
                mask[also_excludes] = False
        # generate the indices of window endings
        indices = np.arange(array_size)[mask]
        self._indices = indices.reshape([-1, 1])
        # the offset array to generate the windows
        self._offsets = np.arange(-window_size + 1, 1)  # e.g. [-4, ..., 0] for window_size=5
        # memorize arguments
        self._array_size = array_size
        self._window_size = window_size
        self._batch_size = batch_size
        self._shuffle = shuffle
        self._ignore_incomplete_batch = ignore_incomplete_batch

    def get_iterator(self, arrays):
        """
        Iterate through the sliding windows of each array in `arrays`.

        This method is not re-entrant, i.e., calling :meth:`get_iterator`
        would invalidate any previously obtained iterator.

        Args:
            arrays (Iterable[np.ndarray]): 1-D or 2-D arrays to be iterated
                along their first axis.

        Yields:
            tuple[np.ndarray]: The windows of arrays of each mini-batch.
        """
        # check the parameters
        arrays = tuple(np.asarray(a) for a in arrays)
        if not arrays:
            raise ValueError('`arrays` must not be empty')
        # shuffle if required
        if self._shuffle:
            np.random.shuffle(self._indices)
        # iterate through the mini-batches
        for s in minibatch_slices_iterator(
                length=len(self._indices),
                batch_size=self._batch_size,
                ignore_incomplete_batch=self._ignore_incomplete_batch):
            idx = self._indices[s] + self._offsets
            yield tuple(a[idx] if len(a.shape) == 1 else a[idx, :] for a in arrays)
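
# Usage sketch (the synthetic series is an assumption): iterate windows of
# length 5 over a 1-D array in mini-batches of 32.
#
#   values = np.random.randn(1000).astype(np.float32)
#   sw = BatchSlidingWindow(array_size=len(values), window_size=5, batch_size=32)
#   for (w,) in sw.get_iterator([values]):
#       ...  # w has shape (32, 5), except possibly for the last batch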


class Loss(nn.Module):
    def __init__(self, mode="AE1"):
        super(Loss, self).__init__()
        self.mode = mode
        assert self.mode in ("AE1", "AE2")

    def forward(self, x, y, z, n, reduction="mean"):
        # Two-phase objective in the style of USAD: the reconstruction term is
        # down-weighted and the adversarial term up-weighted as epoch n grows.
        if self.mode == "AE1":
            return (1 / n) * torch.mean((x - y) ** 2) + \
                   (1 - 1 / n) * torch.mean((x - z) ** 2)
        elif self.mode == "AE2":
            return (1 / n) * torch.mean((x - y) ** 2) - \
                   (1 - 1 / n) * torch.mean((x - z) ** 2)
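
# Usage sketch (names are assumptions, mirroring a USAD-style training step:
# `w` is the input window, `w1` the first autoencoder's reconstruction,
# `w2` the adversarial reconstruction, and `n` the current epoch number):
#
#   loss_fn = Loss(mode="AE1")
#   loss = loss_fn(w, w1, w2, n)
#   loss.backward()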


def metrics(y, y_hat):
    assert y.shape[0] == y_hat.shape[0]
    return precision_score(y, y_hat), recall_score(y, y_hat), f1_score(y, y_hat)


def test_batch(X_test, batch_size=250, window_length=5):
    X_test = preprocess(X_test)
    n_test = len(X_test)
    # cut the stream into non-overlapping windows of `window_length` points
    X_test = np.asarray(np.split(X_test[0:(n_test // window_length) * window_length],
                                 n_test // window_length, axis=0))
    idx = np.arange(len(X_test))
    for batch_idx in np.array_split(idx, len(idx) // batch_size):
        yield X_test[batch_idx]


def ROC(y, y_hat):
    # pick the threshold(s) where the ROC curve crosses TPR = 1 - FPR,
    # i.e. the equal-error-rate operating point
    fpr, tpr, tr = roc_curve(y, y_hat)
    auc = roc_auc_score(y, y_hat)
    idx = np.argwhere(np.diff(np.sign(tpr - (1 - fpr)))).flatten()
    return tr[idx]
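
# Example (hypothetical names): select an operating threshold from the scores,
# then binarize.
#
#   tr = ROC(y_true, scores)
#   y_hat = (scores >= tr[0]).astype(int)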


def test_window(y_test, window_length):
    # collapse point-wise labels into window labels: a window is anomalous
    # if it contains at least one anomalous point
    n_test = len(y_test)
    y_test_ = np.asarray(np.split(y_test[0:(n_test // window_length) * window_length],
                                  n_test // window_length, axis=0))
    y_test_ = np.sum(y_test_, axis=1).flatten()
    y_test_ = np.where(y_test_ >= 1, 1, 0)
    return y_test_
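
# Sketch of how `test_batch` and `test_window` line up (names are assumptions):
# both cut the test stream into non-overlapping windows of the same length, so
# per-window predictions can be compared against the window labels.
#
#   y_true = test_window(y_test, window_length=5)
#   for batch in test_batch(X_test, batch_size=250, window_length=5):
#       ...  # batch has shape (~250, 5, x_dim)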


def histogram(y_test, y_hat):
    plt.figure(figsize=(12, 6))
    plt.hist([y_hat[y_test == 0], y_hat[y_test == 1]], bins=20,
             color=["g", "r"], stacked=True)
    plt.title("Results", size=20)
    plt.grid()
    plt.show()


def adjust_predicts(score, label,
                    threshold=None,
                    pred=None,
                    calc_latency=False):
    """
    Calculate adjusted predict labels using given `score`, `threshold` (or given `pred`) and `label`.

    Args:
        score (np.ndarray): The anomaly score.
        label (np.ndarray): The ground-truth label.
        threshold (float): The threshold of anomaly score.
            A point is labeled as "anomaly" if its score is higher than or
            equal to the threshold.
        pred (np.ndarray or None): If not None, adjust `pred` and ignore
            `score` and `threshold`.
        calc_latency (bool): If True, also return the average detection latency.

    Returns:
        np.ndarray: predict labels (and the latency, if `calc_latency` is True)
    """
    if len(score) != len(label):
        raise ValueError("score and label must have the same length")
    latency = 0
    if pred is None:
        predict = score >= threshold
    else:
        predict = pred
    actual = label
    anomaly_state = False
    anomaly_count = 0
    for i in range(len(score)):
        if actual[i] and predict[i] and not anomaly_state:
            # a true anomaly segment was hit: mark the whole segment,
            # walking backwards to its start
            anomaly_state = True
            anomaly_count += 1
            for j in range(i, 0, -1):
                if not actual[j]:
                    break
                else:
                    if not predict[j]:
                        predict[j] = True
                        latency += 1
        elif not actual[i]:
            anomaly_state = False
        if anomaly_state:
            predict[i] = True
    if calc_latency:
        return predict, latency / (anomaly_count + 1e-4)
    else:
        return predict
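
# Worked example of the point-adjust rule: once any point inside a true anomaly
# segment is detected, the whole segment counts as detected.
#
#   score = np.array([0.1, 0.2, 0.9, 0.3, 0.1])
#   label = np.array([0, 1, 1, 1, 0])
#   adjust_predicts(score, label, threshold=0.8)
#   # -> [False, True, True, True, False]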