8 changes: 4 additions & 4 deletions anomalydetection/robust/bi_lstm_att_predict.py
@@ -51,7 +51,7 @@ def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file):
train_file = pd.read_csv(file_path)
for i in range(len(train_file)):
num_of_sessions += 1
line = [int(id) for id in train_file["Sequence"][i].split(' ')]
line = [int(id) for id in train_file["Sequence"][i].split(' ')[:-1]]
line = line[0:sequence_length]
if len(line) < sequence_length:
line.extend(list([0]) * (sequence_length - len(line)))
@@ -60,7 +60,7 @@ def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file):
if event == 0:
semantic_line.append([-1] * 300)
else:
semantic_line.append(class_type_to_vec[str(event - 1)])
semantic_line.append(class_type_to_vec[str(event)])
input_data.append(semantic_line)
output_data.append(int(train_file["label"][i]))
data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data))
@@ -88,8 +88,8 @@ def do_predict(input_size, hidden_size, num_layers, num_classes, sequence_length
# first traverse [0, window_size)
seq = seq.view(-1, sequence_length, input_size).to(device)
#label = torch.tensor(label).view(-1).to(device)
output = sequential_model(seq)[:, 0].clone().detach().numpy()
predicted = (output > 0.2).astype(int)
output = sequential_model(seq)[:, 0].cpu().clone().detach().numpy()
predicted = (output > 0.05).astype(int)
label = np.array([y for y in label])
TP += ((predicted == 1) * (label == 1)).sum()
FP += ((predicted == 1) * (label == 0)).sum()
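A note on the two changes above: `torch.Tensor.numpy()` cannot convert CUDA tensors, so when the model runs on a GPU the output has to be moved to host memory before thresholding, which is what the added `.cpu()` does. A minimal sketch of the conversion pattern, with a random tensor standing in for the model output (not the repository's model):

import torch

# stand-in for sequential_model(seq)[:, 0]; the device mirrors whatever the model uses
output = torch.sigmoid(torch.randn(8, device='cuda' if torch.cuda.is_available() else 'cpu'))

# .detach() drops the autograd graph; .cpu() is a no-op for CPU tensors but required
# before .numpy() on CUDA tensors
scores = output.detach().cpu().numpy()
predicted = (scores > 0.05).astype(int)  # 0.05 is the decision threshold chosen in this diff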
5 changes: 3 additions & 2 deletions anomalydetection/robust/bi_lstm_att_train.py
@@ -103,7 +103,8 @@ def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file):
train_file = pd.read_csv(file_path)
for i in range(len(train_file)):
num_of_sessions += 1
line = [int(id) for id in train_file["Sequence"][i].split(' ')]
k = train_file["Sequence"][i].split(' ')[:-1]
line = [int(id) for id in k]
line = line[0:sequence_length]
if len(line) < sequence_length:
line.extend(list([0]) * (sequence_length - len(line)))
@@ -112,7 +113,7 @@ def generate_robust_seq_label(file_path, sequence_length, pattern_vec_file):
if event == 0:
semantic_line.append([-1] * 300)
else:
semantic_line.append(class_type_to_vec[str(event - 1)])
semantic_line.append(class_type_to_vec[str(event)])
input_data.append(semantic_line)
output_data.append(int(train_file["label"][i]))
data_set = TensorDataset(torch.tensor(input_data, dtype=torch.float), torch.tensor(output_data))
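Both `generate_robust_seq_label` variants now drop the last token of each split because the `Sequence` cells evidently end with a trailing space, so `split(' ')` yields a final empty string that `int()` rejects. A self-contained sketch of the parse-truncate-pad step, assuming that trailing-space format:

sequence_length = 10
raw = "3 17 42 "  # assumed cell format: ids separated, and terminated, by a single space

line = [int(tok) for tok in raw.split(' ')[:-1]]  # drop the empty last token -> [3, 17, 42]
line = line[0:sequence_length]                    # truncate sessions longer than the window
line.extend([0] * (sequence_length - len(line)))  # pad short sessions with 0, the "no event" id
print(line)  # [3, 17, 42, 0, 0, 0, 0, 0, 0, 0]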
6 changes: 6 additions & 0 deletions log_deep_data_anomaly.py
@@ -10,6 +10,10 @@
from anomalydetection.self_att_lstm import self_att_lstm_train
from anomalydetection.self_att_lstm import self_att_lstm_predict


log_file='./Data/log/hdfs/HDFS_split'
log_file_label='./Data/log/hdfs/HDFS_split_anomaly'

sequential_directory = './Data/logdeepdata/'
train_file_name = 'hdfs_train'
test_abnormal_name = 'hdfs_test_abnormal'
@@ -42,6 +46,8 @@





def train_model():
#log_anomaly_sequential_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file)
self_att_lstm_train.train_model(window_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, data_file, pattern_vec_file)
Binary file modified logparsing/fttree/__pycache__/__init__.cpython-37.pyc
Binary file modified logparsing/fttree/__pycache__/fttree.cpython-37.pyc
198 changes: 153 additions & 45 deletions robust_anomaly_detection.py
@@ -7,67 +7,185 @@
from anomalydetection.loganomaly import log_anomaly_sequential_predict
from anomalydetection.robust import bi_lstm_att_train
from anomalydetection.robust import bi_lstm_att_predict
import os
import re
import numpy as np
import pandas as pd
from collections import OrderedDict
import json
log_file='./Data/log/hdfs/HDFS_split'
log_file_label='./Data/log/hdfs/HDFS_split_anomaly'

# parameters for the early preparation stage
'''log_file_dir = './Data/log/hdfs/'
log_file_name = 'HDFS_split'
log_fttree_out_directory = './Data/FTTreeResult-HDFS/clusters/'
# anomaly line file name, also used in ./Data/log/file_split
anomaly_line_file = './Data/log/hdfs/HDFs_split_anomaly'
wordvec_file_path = './Data/pretrainedwordvec/crawl-300d-2M.vec(0.1M)'
sequential_directory = './Data/FTTreeResult-HDFS/sequential_files/'
train_file_name = 'train_file'
test_file_name = 'test_file'
label_file_name = 'label_file'
pattern_vec_out_path = './Data/FTTreeResult-HDFS/pattern_vec'''
clusters_files='./Data/FTTreeResult-HDFS/clusters/'

temp_directory = './Data/logdeepdata/'
train_file_name = 'robust_log_train.csv'
test_file_name = 'robust_log_test.csv'
valid_file_name = 'robust_log_valid.csv'

# log anomaly sequential model parameters; some parameters may be changed to train similar models
sequence_length = 50
sequence_length = 10
input_size = 300
hidden_size = 128
num_of_layers = 2
# 1 using sigmoid, 2 using softmax
num_of_classes = 1
num_epochs = 20
batch_size = 1000
num_epochs = 50
batch_size = 500
# for robust attention bi
train_root_path = './Data/FTTreeResult-HDFS/robust_att_bi_model_train/'
model_out_path = train_root_path + 'model_out/'
train_file = temp_directory + train_file_name
pattern_vec_json = './Data/logdeepdata/event2semantic_vec.json'
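# With sequence_length = 10, each line of the session file written below ("test10")
# holds ten cluster ids, and each line becomes one labelled sample for the Bi-LSTM.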


# predict parameters
# log anomaly sequential model parameters

'''if not os.path.exists(log_fttree_out_directory):
os.makedirs(log_fttree_out_directory)
if not os.path.exists(sequential_directory):
os.makedirs(sequential_directory)'''
if not os.path.exists(train_root_path):
os.makedirs(train_root_path)


'''def pattern_extract():
    fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2)

# better to generate the train file and the test file at the same time
def extract_feature():
    hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, train_file_name, test_file_name, label_file_name, pattern_vec_out_path, split_degree, log_line_num)'''
def generate_train_and_test_file():

    # read the clustering results into the clusters_result dict
    files = os.listdir(clusters_files)
    clusters_result = {}
    for file in files:  # iterate over the cluster output directory
        if not os.path.isdir(file):  # only open regular files, not directories
            f = open(clusters_files + "/" + file)  # open the cluster file
            f.readline()
            linenums = f.readline()
            linenums = linenums.split(" ")
            clusters_result[file] = linenums
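    # Assumed cluster-file layout, inferred from the two readline() calls above:
    #   line 1: the cluster's pattern string
    #   line 2: space-separated original log line numbers, e.g. "3 17 42"
    # clusters_result therefore maps cluster file name -> list of line-number strings.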


    with open(log_file, 'r') as hdfs_file:
        num = sequence_length
        for i in range(110000):
            log_line = hdfs_file.readline()
            # find which cluster this log line belongs to
            log_key = 0
            for key, value in clusters_result.items():
                if str(i) in value:
                    log_key = key
                    break
            if num > 0:
                with open('test10', 'a+') as t:
                    t.write(str(log_key) + " ")
                num = num - 1
            else:
                num = sequence_length
                with open('test10', 'a+') as t:
                    t.write("\n" + str(log_key) + " ")
                num = num - 1



    # generate the training file


    anomaly_label = ''
    with open(log_file_label, 'r') as label:
        anomaly_label = label.readline().split()
    train_log = []
    test_log = []
    valid_log = []
    anomaly_num = 0
    log_line_num = 0
    with open("test10", 'r') as f:
        for k in range(2000):
            label = 0
            origin_line = f.readline()[:-1]

            for i in anomaly_label:
                i = int(i)
                if sequence_length * k <= i < sequence_length * (k + 1):  # anomaly line falls inside session k
                    label = 1
                    anomaly_num += 1
                    break
            a = []
            a.append(origin_line)
            a.append(label)
            train_log.append(a)
            # train_log[origin_line] = label
        data_df = pd.DataFrame(data=train_log, columns=['Sequence', 'label'])
        data_df.to_csv("./Data/logdeepdata/robust_log_train.csv", index=None)
        print("anomalies in the training set:", anomaly_num)

        anomaly_num = 0
        for k in range(2000, 2500):
            label = 0
            origin_line = f.readline()[:-1]

            for i in anomaly_label:
                i = int(i)
                if sequence_length * k <= i < sequence_length * (k + 1):  # anomaly line falls inside session k
                    label = 1
                    anomaly_num += 1
                    break

            a = []
            a.append(origin_line)
            a.append(label)
            valid_log.append(a)
        data_df = pd.DataFrame(data=valid_log, columns=['Sequence', 'label'])
        data_df.to_csv("./Data/logdeepdata/robust_log_valid.csv", index=None)
        print("anomalies in the validation set:", anomaly_num)



        anomaly_num = 0
        for k in range(2500, 11000):
            label = 0
            origin_line = f.readline()[:-1]
            for i in anomaly_label:
                i = int(i)
                if sequence_length * k <= i < sequence_length * (k + 1):  # anomaly line falls inside session k
                    label = 1
                    anomaly_num += 1
                    break
            a = []
            a.append(origin_line)
            a.append(label)
            test_log.append(a)
        data_df = pd.DataFrame(data=test_log, columns=['Sequence', 'label'])
        data_df.to_csv("./Data/logdeepdata/robust_log_test.csv", index=None)
        print("anomalies in the test set:", anomaly_num)



    # generate the event id -> semantic vector json
    with open("./Data/logdeepdata/pattern_vec") as pvec:
        lines = pvec.readlines()
    vecs = {}
    for line in lines:
        line = line.split("[:]")
        svec = line[-1][:-1].split()
        vec = []
        for s in svec:
            vec.append(float(s))
        # find which pattern (cluster) this vector belongs to
        files = os.listdir(clusters_files)
        clusters_result = {}
        for file in files:  # iterate over the cluster output directory
            if not os.path.isdir(file):  # only open regular files, not directories
                f = open(clusters_files + "/" + file)  # open the cluster file
                pattern = f.readline()[:-1]
                if pattern == line[0]:
                    vecs[file] = vec
                    break

    event2semvec = json.dumps(vecs)

    with open('./Data/logdeepdata/event2semantic_vec.json', 'w') as fw:
        fw.write(event2semvec)
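    # The JSON written above is the pattern_vec_json that generate_robust_seq_label
    # loads as class_type_to_vec: keys are cluster file names (event ids as strings),
    # values are 300-dimensional semantic vectors. Consumer-side sketch ('1' is an
    # illustrative key, not guaranteed to exist):
    #   class_type_to_vec = json.load(open('./Data/logdeepdata/event2semantic_vec.json'))
    #   vec = class_type_to_vec.get('1', [-1] * 300)  # id 0 is padding -> [-1] * 300
    #   len(vec)  # 300, matching input_size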



'''def pattern_extract_test():
    fttree.pattern_extract(log_file_dir, log_file_name, log_fttree_out_directory, 5, 4, 2)


def extract_feature_test():
    hdfs_ft_preprocessor.preprocessor_hdfs_ft(log_fttree_out_directory, anomaly_line_file, wordvec_file_path, sequential_directory, 'train_file')
'''

def train_model():
bi_lstm_att_train.train_model(sequence_length, input_size, hidden_size, num_of_layers, num_of_classes, num_epochs, batch_size, train_root_path, model_out_path, train_file, pattern_vec_json)
@@ -77,16 +195,6 @@ def test_model():
# do something
bi_lstm_att_predict.do_predict(input_size, hidden_size, num_of_layers, num_of_classes, sequence_length, model_out_path + 'Adam_batch_size=' + str(batch_size) + ';epoch=' + str(num_epochs) + '.pt', temp_directory + test_file_name, batch_size, pattern_vec_json)
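    # For reference, the TP/FP counters that do_predict accumulates (plus the matching FN)
    # reduce to the usual metrics; the numbers here are illustrative, not measured results:
    #   precision = TP / (TP + FP)   e.g. 90 / (90 + 15) ≈ 0.857
    #   recall    = TP / (TP + FN)   e.g. 90 / (90 + 10) = 0.900
    #   F1 = 2 * precision * recall / (precision + recall) ≈ 0.878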

#pattern_extract()
#extract_feature()
#train_model()
#train_model()
generate_train_and_test_file()
train_model()
test_model()

# deep log
# log_preprocessor.execute_process()
# value_extract.get_value()
# value_extract.value_deal()
# value_extract.value_extract()
# train predict
