Initial commit (58629fc4) · 提交 · AIOps-NanKai / model / LogBert

..gitignore.un~

0 → 100644

+1.8 KB

添加文件。

此文件类型的文件没有差异预览。

查看文件

.gitignore

0 → 100644

+131 −0

原始行号	差异行号	差异行
		*.csv
		*.log
		.ipynb_checkpoints
		output/*

		# Editors
		.vscode/
		.idea/

		# Vagrant
		.vagrant/

		# Mac/OSX
		.DS_Store

		# Windows
		Thumbs.db

		# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
		# Byte-compiled / optimized / DLL files
		__pycache__/
		*.py[cod]
		*$py.class

		# C extensions
		*.so

		# Distribution / packaging
		.Python
		build/
		develop-eggs/
		dist/
		downloads/
		eggs/
		.eggs/
		lib/
		lib64/
		parts/
		sdist/
		var/
		wheels/
		*.egg-info/
		.installed.cfg
		*.egg
		MANIFEST

		# PyInstaller
		# Usually these files are written by a python script from a template
		# before PyInstaller builds the exe, so as to inject date/other infos into it.
		*.manifest
		*.spec

		# Installer logs
		pip-log.txt
		pip-delete-this-directory.txt

		# Unit test / coverage reports
		htmlcov/
		.tox/
		.nox/
		.coverage
		.coverage.*
		.cache
		nosetests.xml
		coverage.xml
		*.cover
		.hypothesis/
		.pytest_cache/

		# Translations
		*.mo
		*.pot

		# Django stuff:
		*.log
		local_settings.py
		db.sqlite3

		# Flask stuff:
		instance/
		.webassets-cache

		# Scrapy stuff:
		.scrapy

		# Sphinx documentation
		docs/_build/

		# PyBuilder
		target/

		# Jupyter Notebook
		.ipynb_checkpoints

		# IPython
		profile_default/
		ipython_config.py

		# pyenv
		.python-version

		# celery beat schedule file
		celerybeat-schedule

		# SageMath parsed files
		*.sage.py

		# Environments
		.env
		.venv
		env/
		venv/
		ENV/
		env.bak/
		venv.bak/

		# Spyder project settings
		.spyderproject
		.spyproject

		# Rope project settings
		.ropeproject

		# mkdocs documentation
		/site

		# mypy
		.mypy_cache/
		.dmypy.json
		dmypy.json

BGL/baselines.ipynb

0 → 100644

+226 −0

原始行号	差异行号	差异行
		%% Cell type:code id: tags:

		``` python
		#!/usr/bin/env python
		# -- coding: utf-8 --

		import sys
		sys.path.append('../')

		import argparse
		import numpy as np
		import pandas as pd
		import random
		from importlib import reload
		from sklearn.model_selection import GridSearchCV
		from sklearn import svm
		from sklearn.utils import shuffle

		from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
		from loglizer import dataloader, preprocessing
		from loglizer.utils import metrics
		```

		%% Cell type:code id: tags:

		``` python
		ouput_dir = "../output/bgl/"
		middle_dir = ""
		log_file = "BGL.log"
		```

		%% Cell type:markdown id: tags:

		<!-- # Produce event templates from train test dataset -->

		%% Cell type:markdown id: tags:

		# Split train test data

		%% Cell type:code id: tags:

		``` python
		(x_train, y_train), (x_test, y_test) = dataloader.load_data(ouput_dir, middle_dir, log_file, is_mapping=True)
		```

		%% Output

		/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:286: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
		train = np.array(train).reshape(-1,1)
		/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:292: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
		test_normal = np.array(test_normal).reshape(-1,1)
		/home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:298: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
		abnormal = np.array(abnormal).reshape(-1,1)

		Train normal size: 13718
		Train abnormal size: 1207
		Total logkey(exclude 0:UNK) 1000
		Test normal size: 20579
		Test abnormal size: 1811
		num_unk_event in test data: 0

		%% Cell type:code id: tags:

		``` python
		feature_extractor = preprocessing.FeatureExtractor()
		x_train = feature_extractor.fit_transform(x_train)
		x_test = feature_extractor.transform(x_test)
		```

		%% Output

		====== Transformed train data summary ======
		Train data shape: 14925-by-832

		====== Transformed test data summary ======
		Test data shape: 22390-by-832


		%% Cell type:code id: tags:

		``` python
		%%time
		print("="20 + " Model: PCA " + "="20)
		for th in np.arange(1):
		print("theshold", th)
		model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
		model.fit(x_train)
		print('Train validation:')
		precision, recall, f1 = model.evaluate(x_train, y_train)
		print('Test validation:')
		precision, recall, f1 = model.evaluate(x_test, y_test)
		```

		%% Output

		==================== Model: PCA ====================
		theshold 0
		====== Model summary ======
		n_components: 5
		Project matrix shape: 832-by-832
		SPE threshold: 1

		Train validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 1193, FP: 11915, TN: 1803, FN: 14
		Precision: 9.101%, recall: 98.840%, F1-measure: 16.668%

		Test validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 1777, FP: 17824, TN: 2755, FN: 34
		Precision: 9.066%, recall: 98.123%, F1-measure: 16.598%

		CPU times: user 16.9 s, sys: 66.9 ms, total: 17 s
		Wall time: 1.73 s

		%% Cell type:code id: tags:

		``` python
		%%time
		print("="20 + " Model: IsolationForest " + "="20)
		model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
		model.fit(x_train)
		print('Train validation:')
		precision, recall, f1 = model.evaluate(x_train, y_train)
		print('Test validation:')
		precision, recall, f1 = model.evaluate(x_test, y_test)
		```

		%% Output

		==================== Model: IsolationForest ====================
		====== Model summary ======
		Train validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 174, FP: 0, TN: 13718, FN: 1033
		Precision: 100.000, recall: 14.416, F1-measure: 25.199

		Test validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 258, FP: 0, TN: 20579, FN: 1553
		Precision: 100.000, recall: 14.246, F1-measure: 24.940

		CPU times: user 17.2 s, sys: 2.96 s, total: 20.2 s
		Wall time: 18.4 s

		%% Cell type:code id: tags:

		``` python
		%%time
		print("="20 + " Model: one class SVM " + "="20)
		model = OneClassSVM(kernel='rbf')
		model.fit(x_train, y_train)

		print('Train validation:')
		precision, recall, f1 = model.evaluate(x_train, y_train)
		print('Test validation:')
		precision, recall, f1 = model.evaluate(x_test, y_test)
		```

		%% Output

		==================== Model: one class SVM ====================
		====== Model summary ======
		Train validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 152, FP: 13718, TN: 0, FN: 1055
		Precision: 1.096, recall: 12.593, F1-measure: 2.016

		Test validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 227, FP: 20579, TN: 0, FN: 1584
		Precision: 1.091, recall: 12.534, F1-measure: 2.007

		CPU times: user 6min 39s, sys: 69.4 ms, total: 6min 39s
		Wall time: 6min 39s

		%% Cell type:code id: tags:

		``` python
		%%time
		print("="20 + " Model: LogClustering " + "="20)
		max_dist = 0.3 # the threshold to stop the clustering process
		anomaly_threshold = 0.3 # the threshold for anomaly detection
		model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
		model.fit(x_train[y_train == 0, :]) # Use only normal samples for training
		print('Train validation:')
		precision, recall, f1 = model.evaluate(x_train, y_train)
		print('Test validation:')
		precision, recall, f1 = model.evaluate(x_test, y_test)
		```

		%% Output

		==================== Model: LogClustering ====================
		====== Model summary ======
		Starting offline clustering...
		Processed 1000 instances.
		Found 92 clusters offline.

		Starting online clustering...
		Processed 2000 instances.
		Processed 4000 instances.
		Processed 6000 instances.
		Processed 8000 instances.
		Processed 10000 instances.
		Processed 12000 instances.
		Processed 13718 instances.
		Found 172 clusters online.

		Train validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 775, FP: 1, TN: 13717, FN: 432
		Precision: 99.871, recall: 64.209, F1-measure: 78.164

		Test validation:
		====== Evaluation summary ======
		Confusion Matrix: TP: 1215, FP: 64, TN: 20515, FN: 596
		Precision: 94.996, recall: 67.090, F1-measure: 78.641

		CPU times: user 1min 42s, sys: 28.1 ms, total: 1min 42s
		Wall time: 1min 42s

		%% Cell type:code id: tags:

		``` python
		```

BGL/data_process.py

0 → 100644

+183 −0

原始行号	差异行号	差异行
		import sys
		sys.path.append('../')

		import os
		import gc
		import pandas as pd
		import numpy as np
		from logparser import Spell, Drain
		import argparse
		from tqdm import tqdm
		from logdeep.dataset.session import sliding_window

		tqdm.pandas()
		pd.options.mode.chained_assignment = None

		PAD = 0
		UNK = 1
		START = 2

		data_dir = os.path.expanduser("~/.dataset/bgl")
		output_dir = "../output/bgl/"
		log_file = "BGL.log"


		# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
		def count_anomaly():
		total_size = 0
		normal_size = 0
		with open(data_dir + log_file, encoding="utf8") as f:
		for line in f:
		total_size += 1
		if line.split(' ',1)[0] == '-':
		normal_size += 1
		print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


		# def deeplog_df_transfer(df, features, target, time_index, window_size):
		# """
		# :param window_size: offset datetime https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
		# :return:
		# """
		# agg_dict = {target:'max'}
		# for f in features:
		# agg_dict[f] = _custom_resampler
		#
		# features.append(target)
		# features.append(time_index)
		# df = df[features]
		# deeplog_df = df.set_index(time_index).resample(window_size).agg(agg_dict).reset_index()
		# return deeplog_df
		#
		#
		# def _custom_resampler(array_like):
		# return list(array_like)


		def deeplog_file_generator(filename, df, features):
		with open(filename, 'w') as f:
		for _, row in df.iterrows():
		for val in zip(*row[features]):
		f.write(','.join([str(v) for v in val]) + ' ')
		f.write('\n')


		def parse_log(input_dir, output_dir, log_file, parser_type):
		log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
		regex = [
		r'(0x)[0-9a-fA-F]+', #hexadecimal
		r'\d+.\d+.\d+.\d+',
		# r'/\w+( )$'
		r'\d+'
		]
		keep_para = False
		if parser_type == "drain":
		# the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
		st = 0.3 # Similarity threshold
		depth = 3 # Depth of all leaf nodes
		parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=keep_para)
		parser.parse(log_file)
		elif parser_type == "spell":
		tau = 0.55
		parser = Spell.LogParser(indir=data_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=keep_para)
		parser.parse(log_file)

		#
		# def merge_list(time, activity):
		# time_activity = []
		# for i in range(len(activity)):
		# temp = []
		# assert len(time[i]) == len(activity[i])
		# for j in range(len(activity[i])):
		# temp.append(tuple([time[i][j], activity[i][j]]))
		# time_activity.append(np.array(temp))
		# return time_activity


		if __name__ == "__main__":
		#
		#
		# parser = argparse.ArgumentParser()
		# parser.add_argument('-p', default=None, type=str, help="parser type")
		# parser.add_argument('-w', default='T', type=str, help='window size(mins)')
		# parser.add_argument('-s', default='1', type=str, help='step size(mins)')
		# parser.add_argument('-r', default=0.4, type=float, help="train ratio")
		# args = parser.parse_args()
		# print(args)
		#

		##########
		# Parser #
		#########

		parse_log(data_dir, output_dir, log_file, 'drain')

		#########
		# Count #
		#########
		# count_anomaly()

		##################
		# Transformation #
		##################
		# mins
		window_size = 5
		step_size = 1
		train_ratio = 0.4

		df = pd.read_csv(f'{output_dir}{log_file}_structured.csv')

		# data preprocess
		df['datetime'] = pd.to_datetime(df['Time'], format='%Y-%m-%d-%H.%M.%S.%f')
		df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
		df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
		df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
		df['deltaT'].fillna(0)
		# convert time to UTC timestamp
		# df['deltaT'] = df['datetime'].apply(lambda t: (t - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'))

		# sampling with fixed window
		# features = ["EventId", "deltaT"]
		# target = "Label"
		# deeplog_df = deeplog_df_transfer(df, features, target, "datetime", window_size=args.w)
		# deeplog_df.dropna(subset=[target], inplace=True)

		# sampling with sliding window
		deeplog_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
		para={"window_size": int(window_size)60, "step_size": int(step_size) 60}
		)

		#########
		# Train #
		#########
		df_normal =deeplog_df[deeplog_df["Label"] == 0]
		df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True) #shuffle
		normal_len = len(df_normal)
		train_len = int(normal_len * train_ratio)

		train = df_normal[:train_len]
		# deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId", "deltaT"])
		deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])

		print("training size {}".format(train_len))


		###############
		# Test Normal #
		###############
		test_normal = df_normal[train_len:]
		deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
		print("test normal size {}".format(normal_len - train_len))

		del df_normal
		del train
		del test_normal
		gc.collect()

		#################
		# Test Abnormal #
		#################
		df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
		#df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(lambda e: event_index_map[e] if event_index_map.get(e) else UNK)
		deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
		print('test abnormal size {}'.format(len(df_abnormal)))

BGL/deeplog.py

0 → 100644

+126 −0

原始行号	差异行号	差异行
		# -- coding: utf-8 --
		import platform
		import argparse
		import sys
		sys.path.append('../')

		from logdeep.models.lstm import *
		from logdeep.tools.predict import Predicter
		from logdeep.tools.train import Trainer
		from logdeep.tools.utils import *
		from logdeep.dataset.vocab import Vocab

		import torch

		output_dir = "../output/bgl/"

		# Config Parameters
		options = dict()
		options['output_dir'] = output_dir
		options['train_vocab'] = output_dir + 'train'
		options["vocab_path"] = output_dir + "vocab.pkl"

		options['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

		# Smaple
		options['sample'] = "sliding_window"
		options['window_size'] = 20 # if fix_window
		options['train_ratio'] = 1
		options['valid_ratio'] = 0.1
		options['test_ratio'] = 1
		options["min_len"] = 10

		options["is_logkey"] = True
		options["is_time"] = False

		# Features
		options['sequentials'] = options["is_logkey"]
		options['quantitatives'] = False
		options['semantics'] = False
		options['parameters'] = options["is_time"]
		options['feature_num'] = sum(
		[options['sequentials'], options['quantitatives'], options['semantics'], options['parameters']])

		# Model
		options['input_size'] = 1
		options['hidden_size'] = 64
		options['num_layers'] = 2
		options["embedding_dim"] = 50
		options["vocab_size"] = 200
		options['num_classes'] = options["vocab_size"]
		# Train
		options['batch_size'] = 128
		options['accumulation_step'] = 1

		options['optimizer'] = 'adam'
		options['lr'] = 0.01
		options['max_epoch'] = 200
		options["n_epochs_stop"] = 10
		options['lr_step'] = (options['max_epoch'] - 20, options['max_epoch'])
		options['lr_decay_ratio'] = 0.1

		options['resume_path'] = None
		options['model_name'] = "deeplog"
		options['save_dir'] = options["output_dir"] + "deeplog/"

		# Predict
		options['model_path'] = options["save_dir"] + "bestloss.pth"
		options['num_candidates'] = 9
		options["threshold"] = None
		options["gaussian_mean"] = 0
		options["gaussian_std"] = 0
		options["num_outputs"] = 1


		print("Features logkey:{} time: {}".format(options["is_logkey"], options["is_time"]))
		print("Device:", options['device'])

		seed_everything(seed=1234)

		Model = Deeplog(input_size=options['input_size'],
		hidden_size=options['hidden_size'],
		num_layers=options['num_layers'],
		vocab_size=options["vocab_size"],
		embedding_dim=options["embedding_dim"])


		def train():
		trainer = Trainer(Model, options)
		trainer.start_train()


		def predict():
		predicter = Predicter(Model, options)
		predicter.predict_unsupervised()


		if __name__ == "__main__":
		parser = argparse.ArgumentParser()
		subparsers = parser.add_subparsers()

		train_parser = subparsers.add_parser('train')
		train_parser.set_defaults(mode='train')

		predict_parser = subparsers.add_parser('predict')
		predict_parser.set_defaults(mode='predict')
		predict_parser.add_argument('--mean', type=float, default=0, help='error gaussian distribution mean')
		predict_parser.add_argument('--std', type=float, default=0, help='error gaussian distribution std')

		vocab_parser = subparsers.add_parser('vocab')
		vocab_parser.set_defaults(mode='vocab')

		args = parser.parse_args()
		print("arguments", args)

		if args.mode == 'train':
		train()

		elif args.mode == 'predict':
		predict()

		elif args.mode == 'vocab':
		with open(options["train_vocab"], 'r') as f:
		logs = f.readlines()
		vocab = Vocab(logs)
		print("vocab_size", len(vocab))
		vocab.save_vocab(options["vocab_path"])