Initial commit (a3f77373) · 提交 · AIOps-NanKai / model / LogAnomaly

.DS_Store

0 → 100644

+10.0 KB

添加文件。

此文件类型的文件没有差异预览。

查看文件

LogAnomaly_main/detect_vector_2LSTM.py

0 → 100644

+276 −0

原始行号	差异行号	差异行
		#!/usr/bin/python
		# -- coding: UTF-8 --

		from sklearn.metrics import precision_recall_fscore_support
		import numpy
		from tensorflow.keras.layers import Dense
		from tensorflow.keras.layers import Dropout
		from tensorflow.keras.layers import Input
		from tensorflow.keras.layers import concatenate
		from tensorflow.keras.models import Model
		from tensorflow.keras.layers import LSTM
		from tensorflow.keras import backend as K
		import argparse
		import os
		from template2vec import Template2Vec


		def find_newest_file(dir_path):
		    '''Return the path of the most recently created weight file in a directory.

		    Only entries whose name contains 'hdf5' and does not contain 'DS' are
		    considered; candidates are compared by os.path.getctime.

		    Args:
		        dir_path: directory to scan, with or without a trailing separator.
		    Return:
		        newest_file: dir_path joined with the name of the newest matching file.
		    Raises:
		        ValueError: if the directory contains no matching file.
		    '''
		    candidates = [
		        os.path.join(dir_path, filename)
		        for filename in os.listdir(dir_path)
		        if 'DS' not in filename and 'hdf5' in filename
		    ]
		    if not candidates:
		        raise ValueError('no hdf5 weight file found in ' + repr(dir_path))
		    # max() returns the first candidate when creation times tie.
		    return max(candidates, key=os.path.getctime)


		def detect_anomaly(para):
		    '''Replay a template sequence through the trained two-branch LSTM and score it.

		    For every window of seq_length templates the model predicts the next
		    template. For each k in 1..n_candidates the sample is flagged as an
		    anomaly when the actual next template is not among the top-k candidates.
		    Precision, recall and F1 against the label file are printed per k, and
		    "precision recall" is written to the result file, one line per k.

		    Args:
		        para: dict with the keys 'test_file', 'label_file', 'seq_length',
		            'n_candidates', 'onehot', 'model_filename', 'model_dir',
		            'template_index_map_path', 'result_file', 'template_num',
		            'template2Vec_file', 'template_file' and 'count_matrix'.
		            'windows_size' and 'step_size' may be present but are not used.
		    Return:
		        None
		    '''
		    import time
		    t1 = time.time()

		    filename = para['test_file']
		    seq_length = para['seq_length']
		    n_candidates = para['n_candidates']  # size of the top-n candidate set
		    onehot = para['onehot']  # 1: softmax over template indices, 0: template2vec output
		    model_filename = para['model_filename']  # trained weights file; '' means auto-detect
		    model_dir = para['model_dir']  # directory searched for weights when model_filename is ''
		    template_index_map_path = para['template_index_map_path']  # template id <-> index map file
		    result_file = para['result_file']
		    template_num = para['template_num']
		    label_file = para['label_file']
		    template2Vec_file = para['template2Vec_file']
		    template_file = para['template_file']
		    count_matrix_flag = para['count_matrix']
		    temp2Vec = Template2Vec(template2Vec_file, template_file)

		    # Without an explicit weights file, fall back to the newest one in model_dir.
		    if model_filename == '':
		        model_filename = find_newest_file(model_dir)
		    print('cur_model_filename',model_filename)

		    template_to_int = {}
		    int_to_template = {}
		    if template_num == 0:
		        # template_num == 0: read the mapping from the map file
		        # (one "<template> <index>" pair per line).
		        with open(template_index_map_path) as IN:
		            for line in IN:
		                fields = line.strip().split()
		                template = fields[0]
		                index = int(fields[1])
		                template_to_int[template] = index
		                int_to_template[index] = template
		    else:
		        # Otherwise build it directly: indices start at 0, template ids at '1'.
		        template_to_int = {str(i + 1): i for i in range(template_num)}
		        int_to_template = {i: str(i + 1) for i in range(template_num)}

		    # Sequence and label files are read in lockstep so that a skipped log
		    # line also skips its label.
		    raw_text = []
		    raw_label_list = []
		    with open(filename) as line_IN, open(label_file) as label_IN:
		        for line, label_line in zip(line_IN, label_IN):
		            template = line.strip().split()[1]
		            if template != '-1' and template != '0' and template in template_to_int:
		                raw_text.append(template)
		                raw_label_list.append(int(label_line.strip()))

		    n_chars = len(raw_text)
		    n_templates = len(template_to_int)
		    print ("length of log sequence: ", n_chars)
		    print ("# of templates: ", n_templates)

		    label_list = []
		    vectorX = []
		    vectorY = []
		    for i in range(0, n_chars - seq_length, 1):
		        seq_in = raw_text[i:i + seq_length]
		        seq_out = raw_text[i + seq_length]
		        if count_matrix_flag == 0:
		            # Template vectors only.
		            window = [list(temp2Vec.model[seq]) for seq in seq_in]
		        else:
		            # Template vector concatenated with the window's count vector.
		            # The counts depend on the window only, so build them once.
		            count_vector = [0] * n_templates
		            for t in seq_in:
		                count_vector[template_to_int[t]] += 1
		            window = [list(temp2Vec.model[seq]) + count_vector for seq in seq_in]
		        vectorX.append(window)
		        vectorY.append(temp2Vec.model[seq_out])
		        label_list.append(raw_label_list[i + seq_length])
		    n_patterns = len(vectorX)
		    print ("# of patterns: ", n_patterns)

		    if count_matrix_flag == 0:
		        feature_dim = temp2Vec.dimension
		    else:
		        feature_dim = temp2Vec.dimension + n_templates
		    X = numpy.reshape(vectorX, (-1, seq_length, feature_dim))
		    y = numpy.reshape(vectorY, (-1, temp2Vec.dimension))

		    # Two LSTM branches (template vectors / count vectors) merged by a dense
		    # softmax layer. The Input layers alone define the expected shapes.
		    model_vector_input = Input(shape=(X.shape[1], temp2Vec.dimension))
		    model_vector_hidden = LSTM(128, return_sequences=False)(model_vector_input)
		    model_vector_output = Dropout(0.2)(model_vector_hidden)

		    model_count_input = Input(shape=(X.shape[1], n_templates))
		    model_count_hidden = LSTM(128, return_sequences=False)(model_count_input)
		    model_count_output = Dropout(0.2)(model_count_hidden)

		    concatenated = concatenate([model_vector_output, model_count_output])
		    if onehot == 0:
		        out = Dense(temp2Vec.dimension, activation='softmax')(concatenated)
		    else:
		        out = Dense(n_templates, activation='softmax')(concatenated)

		    model = Model([model_vector_input, model_count_input], out)

		    # Load the trained weights, then compile with the loss matching the output mode.
		    model.load_weights(model_filename)
		    loss = 'categorical_crossentropy' if onehot == 1 else 'mse'
		    model.compile(loss=loss, optimizer='adam')

		    total = 0
		    # anomaly_count_dir[k] holds one 0/1 flag per sample for the top-k test.
		    anomaly_count_dir = {k: [] for k in range(1, n_candidates + 1)}
		    test1_time = time.time()
		    for x, aim_y_vector in zip(X, y):
		        total += 1
		        if total % 1000 == 0:
		            test2_time = time.time()
		            print(str(total)+'/'+str(len(X)),str( round(100*total/len(X),3) ),'% time:',(test2_time - test1_time)/60)
		            test1_time = time.time()
		        aim_y_char = temp2Vec.vector_to_most_similar(aim_y_vector, topn = 1)[0][0]
		        x = numpy.reshape(x, (1, seq_length, feature_dim))
		        # The first temp2Vec.dimension features feed the vector branch, the rest the count branch.
		        prediction = model.predict([x[:,:,:temp2Vec.dimension],x[:,:,temp2Vec.dimension:]], verbose=0)[0]

		        if onehot == 1:
		            # Rank the output indices once; the top-k are the last k entries.
		            ranked_index = prediction.argsort()
		            for k in range(1, n_candidates + 1):
		                top_n_tag = [int_to_template[index] for index in ranked_index[-k:]]
		                anomaly_count_dir[k].append(0 if aim_y_char in top_n_tag else 1)
		        else:
		            top_n_tuple = temp2Vec.vector_to_most_similar(prediction, topn=n_candidates)
		            for k in range(1, n_candidates + 1):
		                top_n = [t[0] for t in top_n_tuple[:k]]
		                anomaly_count_dir[k].append(0 if aim_y_char in top_n else 1)

		    # The context manager closes the result file even if scoring raises.
		    with open(result_file, 'w') as f:
		        print('\nanomaly detection result:')
		        for k in range(1, n_candidates + 1):
		            print('next tag is not in top'+str(k)+' candidates:')
		            # labels=[0, 1] keeps row 1 (the anomaly class) available even when
		            # only one class occurs in the labels and predictions.
		            precision, recall, f1_score, _ = numpy.array(list(precision_recall_fscore_support(
		                label_list, anomaly_count_dir[k], labels=[0, 1])))[:, 1]
		            print('=' * 20, 'RESULT', '=' * 20)
		            tp = 0
		            fp = 0
		            tn = 0
		            fn = 0
		            for ground_truth, detected in zip(label_list, anomaly_count_dir[k]):
		                if ground_truth == 1 and detected == 1:
		                    tp += 1
		                if ground_truth == 1 and detected == 0:
		                    fn += 1
		                if ground_truth == 0 and detected == 0:
		                    tn += 1
		                if ground_truth == 0 and detected == 1:
		                    fp += 1
		            print("Precision: %.6f, Recall: %.6f, F1_score: %.6f" % (precision, recall, f1_score))
		            print('tp:',tp, 'fn:',fn,'tn:',tn,'fp:',fp,'total:',tp+tn+fp+fn)
		            print('=' * 20, 'RESULT', '=' * 20)
		            f.write(str(precision)+' '+str(recall)+'\n')

		    t2 = time.time()
		    print('testing time:',(t2-t1)/60,'mins')
		    print ("\nDone.")


		if __name__ == '__main__':
		    # Command-line entry point: declare the options, run detection, then
		    # release the Keras session.
		    # Each row is (flag, help text, value type, default).
		    # NOTE(review): the training script writes its map to '<train_file>_map',
		    # while the default for -template_index_map_path points elsewhere -- confirm
		    # which file is intended.
		    option_table = [
		        ('-test_file', 'test_file.', str, '../../middle/bgl_log.seq'),
		        ('-seq_length', 'seq_length.', int, 10),
		        ('-n_candidates', 'n_candidates.', int, 15),
		        ('-windows_size', 'windows_size.', int, 3),
		        ('-step_size', 'step_size.', int, 1),
		        ('-model_filename', 'you can give a model file.', str, ''),
		        ('-model_dir', 'model_dir.', str, '../weights/vector_deeplog/'),
		        ('-template_index_map_path', 'template_index_map_path.', str, './bgl_log_template_to_int.txt'),
		        ('-onehot', '默认为1。1表示统计使用onehot，0表示使用template2vec', int, 1),
		        ('-result_file', 'result_file.', str, '../results/bgl_log_log_pr.txt'),
		        ('-template_num', '若为0，则根据输入文件统计，否则，根据输入确定。默认0', int, 0),
		        ('-label_file', 'label_file.', str, '../../data/bgl_label'),
		        ('-count_matrix', '默认为0。1表示统计count_matrix，0不统计', int, 0),
		        ('-template2Vec_file', 'template2Vec_file', str, '../../model/bgl_log.template_vector'),
		        ('-template_file', 'template_file', str, '../../middle/bgl_log.template'),
		    ]

		    parser = argparse.ArgumentParser()
		    for flag, help_text, value_type, default_value in option_table:
		        parser.add_argument(flag, help=help_text, type=value_type, default=default_value)
		    args = parser.parse_args()

		    # Every parsed option maps one-to-one onto a parameter key.
		    para_detect = dict(vars(args))

		    detect_anomaly(para_detect)
		    print('detection finish')

		    K.clear_session()

LogAnomaly_main/template2vec.py

0 → 100644

+90 −0

原始行号	差异行号	差异行
		#!/usr/bin/python
		# -- coding: UTF-8 --

		import os
		from gensim.models.word2vec import Word2Vec
		import gensim
		import numpy as np


		class Template2Vec:
		    '''Lookup helper around a word2vec-format template embedding.

		    Templates are keyed by the strings '1'..'N' in the embedding, where N is
		    the number of lines in the template file.
		    '''

		    def __init__(self, model_file, template_file, is_binary=False):
		        '''Load the embedding and index the templates.

		        Args:
		            model_file: embedding file in word2vec format.
		            template_file: template file; its line count is the template count.
		            is_binary: True if model_file uses the binary word2vec format.
		        '''
		        print('reading template2vec model')
		        model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary = is_binary)

		        # Number of templates = number of lines in the template file.
		        with open(template_file) as fin:
		            template_num = sum(1 for _ in fin)

		        # Indices start at 0 (used by the LSTM), template ids start at '1'.
		        template_to_index = {str(i + 1): i for i in range(template_num)}
		        index_to_template = {i: str(i + 1) for i in range(template_num)}

		        # np.asmatrix is the function the np.mat alias pointed to.
		        self.template_matrix = np.asmatrix([model[key] for key in template_to_index])

		        self.model = model
		        self.template_to_index = template_to_index
		        self.index_to_template = index_to_template
		        self.template_num = len(template_to_index)
		        self.dimension = len(model['1'])
		        # (vector, template id) pairs scanned by vector_to_most_similar.
		        self.vector_template_tuple = [(model[key], key) for key in template_to_index]
		        print(' Template2Vec.dimension:', self.dimension)
		        print(' Template2Vec.template_num:', self.template_num)

		    def word_to_most_similar(self, in_word, topn = 1):
		        '''Return the entries most similar to the given word(s).

		        Args:
		            in_word: value passed as `positive` to the model's most_similar.
		            topn: number of results requested.
		        Return:
		            whatever the model's most_similar returns for these arguments.
		        '''
		        return self.model.most_similar(positive = in_word, topn = topn)

		    def vector_to_most_similar(self, in_vector, topn = 1):
		        '''Rank all templates by cosine similarity to a vector.

		        Args:
		            in_vector: query vector.
		            topn: maximum number of results.
		        Return:
		            list of (template id, similarity) tuples, most similar first. When
		            in_vector is a template's own vector, that template ranks first
		            (barring ties).
		        '''
		        scored = [(template, self.cos(in_vector, vector))
		                  for vector, template in self.vector_template_tuple]
		        # list.sort is stable, so ties keep template order.
		        scored.sort(key=lambda pair: pair[1], reverse=True)
		        return scored[:topn]

		    def cos(self, vector1, vector2):
		        '''Return the cosine similarity of two vectors.

		        Args:
		            vector1: first vector (array-like).
		            vector2: second vector (array-like), same length.
		        Return:
		            dot(vector1, vector2) / (|vector1| * |vector2|) as a float, or 0.0
		            when either vector has zero norm.
		        '''
		        v1 = np.asarray(vector1, dtype=float)
		        v2 = np.asarray(vector2, dtype=float)
		        norm1 = np.linalg.norm(v1)
		        norm2 = np.linalg.norm(v2)
		        if norm1 == 0 or norm2 == 0:
		            # The cosine is undefined for a zero vector; report "no similarity".
		            return 0.0
		        # Divide by the product of both norms (the parentheses matter).
		        return float(np.dot(v1, v2)) / float(norm1 * norm2)


		if __name__ == '__main__':
		    # Manual smoke test against the default BGL files: query template '26'
		    # by word and by vector and print the neighbours.
		    demo = Template2Vec('../../model/bgl_log.template_vector',
		                        '../../middle/bgl_log.template')
		    probe_vector = demo.model['26']

		    by_word = demo.word_to_most_similar(['26'], topn = 3)
		    print(by_word)
		    print(demo.vector_to_most_similar(probe_vector, topn = 4))
		    nearest = demo.vector_to_most_similar(probe_vector, topn = 1)
		    print(nearest[0][0])
		    print(demo.vector_to_most_similar(probe_vector, topn = 4))

LogAnomaly_main/train_vector_2LSTM.py

0 → 100644

+191 −0

原始行号	差异行号	差异行
		#!/usr/bin/python
		# -- coding: UTF-8 --

		import numpy
		from tensorflow.keras.layers import Dense
		from tensorflow.keras.layers import Dropout
		from tensorflow.keras.layers import LSTM
		from tensorflow.keras.layers import Input
		from tensorflow.keras.layers import concatenate
		from tensorflow.keras.models import Model
		from tensorflow.keras.callbacks import ModelCheckpoint
		from tensorflow.keras.utils import to_categorical
		from tensorflow.keras import backend as K
		import argparse
		from template2vec import Template2Vec
		import time


		def create_dir(path):
		    '''Create a directory, including missing parents, unless the path exists.

		    Args:
		        path: directory to create.
		    Return:
		        None
		    '''
		    import os
		    try:
		        os.makedirs(path)
		    except FileExistsError:
		        # The path is already there (possibly created by another process
		        # in the meantime); nothing to do.
		        pass

		def train_model(para):
		    '''Train the two-branch LSTM that predicts the next log template.

		    The training sequence is cut into windows of seq_length templates; each
		    window is encoded with template vectors (optionally concatenated with the
		    window's template-count vector) and the model learns to predict the
		    template that follows. Checkpoints are written to model_dir whenever the
		    training loss improves.

		    Args:
		        para: dict with the keys 'epoch', 'train_file', 'seq_length',
		            'model_dir', 'template_num', 'template2Vec_file', 'template_file',
		            'count_matrix', 'onehot' and 'template_index_map_path'.
		    Return:
		        n_templates: number of entries in the template-to-index mapping.
		    '''
		    t1 = time.time()
		    epoch = para['epoch']
		    filename = para['train_file']
		    seq_length = para['seq_length']
		    model_dir = para['model_dir']
		    template_num = para['template_num']
		    template2Vec_file = para['template2Vec_file']
		    template_file = para['template_file']
		    count_matrix_flag = para['count_matrix']
		    onehot = para['onehot']
		    temp2Vec = Template2Vec(template2Vec_file, template_file)

		    create_dir(model_dir)
		    template_index_map_path = para['template_index_map_path']  # where the template -> index map is saved
		    raw_text = []
		    with open(filename) as fin:
		        for line in fin:
		            template = line.strip().split()[1]
		            if template != '-1':
		                raw_text.append(template)
		    t_read_raw_log = time.time()
		    print('t_read_raw_log',(t_read_raw_log-t1)/60,'mins')

		    if template_num == 0:
		        # template_num == 0: derive the mapping from the templates seen in
		        # the sequence and save it as "<template> <index>" lines.
		        chars = sorted(set(raw_text))
		        template_to_int = {c: i for i, c in enumerate(chars)}
		        print('template_to_int', template_to_int)
		        # The context manager closes the map file even if a write fails.
		        with open(template_index_map_path, 'w') as f:
		            for template, index in template_to_int.items():
		                f.write(str(template)+' '+str(index)+'\n')
		    else:
		        # Otherwise build it directly: indices start at 0, template ids at '1'.
		        template_to_int = {str(i + 1): i for i in range(template_num)}
		        print('template_to_int', template_to_int)

		    n_chars = len(raw_text)
		    n_templates = len(template_to_int)
		    print ("length of log sequence: ", n_chars)
		    print ("# of templates: ", n_templates)

		    dataY = []
		    vectorX = []
		    vectorY = []
		    for i in range(0, n_chars - seq_length, 1):
		        seq_in = raw_text[i:i + seq_length]
		        seq_out = raw_text[i + seq_length]
		        dataY.append(template_to_int[seq_out])
		        if count_matrix_flag == 0:
		            # Template vectors only.
		            window = [list(temp2Vec.model[seq]) for seq in seq_in]
		        else:
		            # Template vector concatenated with the window's count vector.
		            # The counts depend on the window only, so build them once.
		            count_vector = [0] * n_templates
		            for t in seq_in:
		                count_vector[template_to_int[t]] += 1
		            window = [list(temp2Vec.model[seq]) + count_vector for seq in seq_in]

		        vectorX.append(numpy.array(window))
		        vectorY.append(temp2Vec.model[seq_out])

		    n_patterns = len(vectorX)
		    print ("# of patterns:", n_patterns)
		    t_generate_vector = time.time()
		    print('generateVector time:',(t_generate_vector - t_read_raw_log)/60,'mins' )

		    if count_matrix_flag == 0:
		        feature_dim = temp2Vec.dimension
		    else:
		        feature_dim = temp2Vec.dimension + n_templates
		    X = numpy.reshape(vectorX, (-1, seq_length, feature_dim))
		    y = numpy.reshape(vectorY, (-1, temp2Vec.dimension))
		    t_reshape = time.time()
		    print('t_reshape:',(t_reshape - t_generate_vector)/60,'mins' )

		    if onehot == 1:
		        # One-hot targets over the template indices replace the vector targets.
		        y = to_categorical(dataY, num_classes = n_templates)
		    t_tocategorical = time.time()
		    print('t_tocategorical:',(t_tocategorical - t_reshape)/60,'mins' )

		    # Two LSTM branches (template vectors / count vectors) merged by a dense
		    # softmax layer. The Input layers alone define the expected shapes.
		    model_vector_input = Input(shape=(X.shape[1], temp2Vec.dimension))
		    model_vector_hidden = LSTM(128, return_sequences=False)(model_vector_input)
		    model_vector_output = Dropout(0.2)(model_vector_hidden)

		    model_count_input = Input(shape=(X.shape[1], n_templates))
		    model_count_hidden = LSTM(128, return_sequences=False)(model_count_input)
		    model_count_output = Dropout(0.2)(model_count_hidden)

		    concatenated = concatenate([model_vector_output, model_count_output])
		    if onehot == 0:
		        out = Dense(temp2Vec.dimension, activation='softmax')(concatenated)
		    else:
		        out = Dense(n_templates, activation='softmax')(concatenated)

		    model = Model([model_vector_input, model_count_input], out)
		    loss = 'categorical_crossentropy' if onehot == 1 else 'mse'
		    model.compile(loss=loss, optimizer='adam')

		    if count_matrix_flag == 0:
		        s = 'only_vector'
		    else:
		        s = 'contact_matrix'
		    # Keras fills in {epoch} and {loss} when it writes a checkpoint.
		    filepath = model_dir+"log_weights-"+ s +"-{epoch:02d}-{loss:.4f}-bigger.hdf5"
		    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
		    callbacks_list = [checkpoint]

		    # The first temp2Vec.dimension features feed the vector branch, the rest the count branch.
		    model.fit([X[:,:,:temp2Vec.dimension], X[:,:,temp2Vec.dimension:]], y, batch_size=64, epochs=epoch, callbacks=callbacks_list)

		    t2 = time.time()
		    print('training time:',(t2-t1)/60,'mins')
		    return n_templates


		if __name__ == '__main__':
		    # Command-line entry point: declare the options, train, then release
		    # the Keras session.
		    # Each row is (flag, help text, value type, default).
		    option_table = [
		        ('-train_file', 'train_file.', str, '../../middle/bgl_log.seq'),
		        ('-seq_length', 'seq_length.', int, 10),
		        ('-model_dir', '网络参数的输出文件夹', str, '../weights/vector_deeplog/'),
		        ('-template_num', '若为0，则根据输入文件统计，否则，根据输入确定。默认0', int, 0),
		        ('-template2Vec_file', 'template2Vec_file', str, '../../model/bgl_log.template_vector'),
		        ('-count_matrix', '默认为0。1表示统计count_matrix，0不统计', int, 0),
		        ('-onehot', '默认为1。1表示统计使用onehot，0表示使用template2vec', int, 1),
		        ('-template_file', 'template_file', str, '../../middle/bgl_log.template'),
		        ('-epoch', 'epoch', int, 30),
		    ]

		    parser = argparse.ArgumentParser()
		    for flag, help_text, value_type, default_value in option_table:
		        parser.add_argument(flag, help=help_text, type=value_type, default=default_value)
		    args = parser.parse_args()

		    # Parsed options map one-to-one onto parameter keys; the map file path
		    # is derived from the training file name.
		    para_train = dict(vars(args), template_index_map_path=args.train_file+'_map')

		    n_templates = train_model(para_train)

		    K.clear_session()
		    print('training has finished')

README.md

0 → 100644

+0 −0

添加文件。

预览已超出大小限制，变更已折叠。