提交 58629fc4 编辑于 作者: openaiops's avatar openaiops
浏览文件

Initial commit

上级
加载中
加载中
加载中
加载中
加载中

..gitignore.un~

0 → 100644
+1.8 KB

添加文件。

此文件类型的文件没有差异预览。

.gitignore

0 → 100644
+131 −0
原始行号 差异行号 差异行
*.csv
*.log
.ipynb_checkpoints
output/*

# Editors
.vscode/
.idea/

# Vagrant
.vagrant/

# Mac/OSX
.DS_Store

# Windows
Thumbs.db

# Source for the following rules: https://raw.githubusercontent.com/github/gitignore/master/Python.gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

BGL/baselines.ipynb

0 → 100644
+226 −0
原始行号 差异行号 差异行
%% Cell type:code id: tags:

``` python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
sys.path.append('../')

import argparse
import numpy as np
import pandas as pd
import random
from importlib import reload
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.utils import shuffle

from loglizer.models import InvariantsMiner, PCA, IsolationForest, OneClassSVM, LogClustering, LR, SVM
from loglizer import dataloader, preprocessing
from loglizer.utils import metrics
```

%% Cell type:code id: tags:

``` python
ouput_dir = "../output/bgl/"
middle_dir = ""
log_file = "BGL.log"
```

%% Cell type:markdown id: tags:

<!-- # Produce event templates from train test dataset -->

%% Cell type:markdown id: tags:

# Split train test data

%% Cell type:code id: tags:

``` python
(x_train, y_train), (x_test, y_test) = dataloader.load_data(ouput_dir, middle_dir, log_file, is_mapping=True)
```

%% Output

    /home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:286: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
      train = np.array(train).reshape(-1,1)
    /home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:292: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
      test_normal = np.array(test_normal).reshape(-1,1)
    /home/haixuanguo/Documents/deeplog_copy_github/BGL/../loglizer/dataloader.py:298: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
      abnormal = np.array(abnormal).reshape(-1,1)

    Train normal size: 13718
    Train abnormal size: 1207
    Total logkey(exclude 0:UNK) 1000
    Test normal size: 20579
    Test abnormal size: 1811
    num_unk_event in test data: 0

%% Cell type:code id: tags:

``` python
feature_extractor = preprocessing.FeatureExtractor()
x_train = feature_extractor.fit_transform(x_train)
x_test = feature_extractor.transform(x_test)
```

%% Output

    ====== Transformed train data summary ======
    Train data shape: 14925-by-832
    
    ====== Transformed test data summary ======
    Test data shape: 22390-by-832
    

%% Cell type:code id: tags:

``` python
%%time
print("="*20 + " Model: PCA " + "="*20)
for th in np.arange(1):
    print("theshold", th)
    model = PCA(n_components=0.8, threshold=1, c_alpha = 1.9600)
    model.fit(x_train)
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)
    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)
```

%% Output

    ==================== Model: PCA ====================
    theshold 0
    ====== Model summary ======
    n_components: 5
    Project matrix shape: 832-by-832
    SPE threshold: 1
    
    Train validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 1193, FP: 11915, TN: 1803, FN: 14
    Precision: 9.101%, recall: 98.840%, F1-measure: 16.668%
    
    Test validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 1777, FP: 17824, TN: 2755, FN: 34
    Precision: 9.066%, recall: 98.123%, F1-measure: 16.598%
    
    CPU times: user 16.9 s, sys: 66.9 ms, total: 17 s
    Wall time: 1.73 s

%% Cell type:code id: tags:

``` python
%%time
print("="*20 + " Model: IsolationForest " + "="*20)
model = IsolationForest(n_estimators=100, max_samples='auto', contamination='auto', random_state=19)
model.fit(x_train)
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)
```

%% Output

    ==================== Model: IsolationForest ====================
    ====== Model summary ======
    Train validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 174, FP: 0, TN: 13718, FN: 1033
    Precision: 100.000, recall: 14.416, F1-measure: 25.199
    
    Test validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 258, FP: 0, TN: 20579, FN: 1553
    Precision: 100.000, recall: 14.246, F1-measure: 24.940
    
    CPU times: user 17.2 s, sys: 2.96 s, total: 20.2 s
    Wall time: 18.4 s

%% Cell type:code id: tags:

``` python
%%time
print("="*20 + " Model: one class SVM " + "="*20)
model = OneClassSVM(kernel='rbf')
model.fit(x_train, y_train)

print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)
```

%% Output

    ==================== Model: one class SVM ====================
    ====== Model summary ======
    Train validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 152, FP: 13718, TN: 0, FN: 1055
    Precision: 1.096, recall: 12.593, F1-measure: 2.016
    
    Test validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 227, FP: 20579, TN: 0, FN: 1584
    Precision: 1.091, recall: 12.534, F1-measure: 2.007
    
    CPU times: user 6min 39s, sys: 69.4 ms, total: 6min 39s
    Wall time: 6min 39s

%% Cell type:code id: tags:

``` python
%%time
print("="*20 + " Model: LogClustering " + "="*20)
max_dist = 0.3  # the threshold to stop the clustering process
anomaly_threshold = 0.3  # the threshold for anomaly detection
model = LogClustering(max_dist=max_dist, anomaly_threshold=anomaly_threshold)
model.fit(x_train[y_train == 0, :])  # Use only normal samples for training
print('Train validation:')
precision, recall, f1 = model.evaluate(x_train, y_train)
print('Test validation:')
precision, recall, f1 = model.evaluate(x_test, y_test)
```

%% Output

    ==================== Model: LogClustering ====================
    ====== Model summary ======
    Starting offline clustering...
    Processed 1000 instances.
    Found 92 clusters offline.
    
    Starting online clustering...
    Processed 2000 instances.
    Processed 4000 instances.
    Processed 6000 instances.
    Processed 8000 instances.
    Processed 10000 instances.
    Processed 12000 instances.
    Processed 13718 instances.
    Found 172 clusters online.
    
    Train validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 775, FP: 1, TN: 13717, FN: 432
    Precision: 99.871, recall: 64.209, F1-measure: 78.164
    
    Test validation:
    ====== Evaluation summary ======
    Confusion Matrix: TP: 1215, FP: 64, TN: 20515, FN: 596
    Precision: 94.996, recall: 67.090, F1-measure: 78.641
    
    CPU times: user 1min 42s, sys: 28.1 ms, total: 1min 42s
    Wall time: 1min 42s

%% Cell type:code id: tags:

``` python
```

BGL/data_process.py

0 → 100644
+183 −0
原始行号 差异行号 差异行
import sys
sys.path.append('../')

import os
import gc
import pandas as pd
import numpy as np
from logparser import Spell, Drain
import argparse
from tqdm import tqdm
from logdeep.dataset.session import sliding_window

tqdm.pandas()
pd.options.mode.chained_assignment = None

PAD = 0
UNK = 1
START = 2

data_dir = os.path.expanduser("~/.dataset/bgl")
output_dir = "../output/bgl/"
log_file = "BGL.log"


# In the first column of the log, "-" indicates non-alert messages while others are alert messages.
def count_anomaly():
    total_size = 0
    normal_size = 0
    with open(data_dir + log_file, encoding="utf8") as f:
        for line in f:
            total_size += 1
            if line.split(' ',1)[0] == '-':
                normal_size += 1
    print("total size {}, abnormal size {}".format(total_size, total_size - normal_size))


# def deeplog_df_transfer(df, features, target, time_index, window_size):
#     """
#     :param window_size: offset datetime https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
#     :return:
#     """
#     agg_dict = {target:'max'}
#     for f in features:
#         agg_dict[f] = _custom_resampler
#
#     features.append(target)
#     features.append(time_index)
#     df = df[features]
#     deeplog_df = df.set_index(time_index).resample(window_size).agg(agg_dict).reset_index()
#     return deeplog_df
#
#
# def _custom_resampler(array_like):
#     return list(array_like)


def deeplog_file_generator(filename, df, features):
    with open(filename, 'w') as f:
        for _, row in df.iterrows():
            for val in zip(*row[features]):
                f.write(','.join([str(v) for v in val]) + ' ')
            f.write('\n')


def parse_log(input_dir, output_dir, log_file, parser_type):
    log_format = '<Label> <Id> <Date> <Code1> <Time> <Code2> <Component1> <Component2> <Level> <Content>'
    regex = [
        r'(0x)[0-9a-fA-F]+', #hexadecimal
        r'\d+.\d+.\d+.\d+',
        # r'/\w+( )$'
        r'\d+'
    ]
    keep_para = False
    if parser_type == "drain":
        # the hyper parameter is set according to http://jmzhu.logpai.com/pub/pjhe_icws2017.pdf
        st = 0.3  # Similarity threshold
        depth = 3  # Depth of all leaf nodes
        parser = Drain.LogParser(log_format, indir=input_dir, outdir=output_dir, depth=depth, st=st, rex=regex, keep_para=keep_para)
        parser.parse(log_file)
    elif parser_type == "spell":
        tau = 0.55
        parser = Spell.LogParser(indir=data_dir, outdir=output_dir, log_format=log_format, tau=tau, rex=regex, keep_para=keep_para)
        parser.parse(log_file)

#
# def merge_list(time, activity):
#     time_activity = []
#     for i in range(len(activity)):
#         temp = []
#         assert len(time[i]) == len(activity[i])
#         for j in range(len(activity[i])):
#             temp.append(tuple([time[i][j], activity[i][j]]))
#         time_activity.append(np.array(temp))
#     return time_activity


if __name__ == "__main__":
    #
    #
    # parser = argparse.ArgumentParser()
    # parser.add_argument('-p', default=None, type=str, help="parser type")
    # parser.add_argument('-w', default='T', type=str, help='window size(mins)')
    # parser.add_argument('-s', default='1', type=str, help='step size(mins)')
    # parser.add_argument('-r', default=0.4, type=float, help="train ratio")
    # args = parser.parse_args()
    # print(args)
    #

    ##########
    # Parser #
    #########

    parse_log(data_dir, output_dir, log_file, 'drain')

    #########
    # Count #
    #########
    # count_anomaly()

    ##################
    # Transformation #
    ##################
    # mins
    window_size = 5
    step_size = 1
    train_ratio = 0.4

    df = pd.read_csv(f'{output_dir}{log_file}_structured.csv')

    # data preprocess
    df['datetime'] = pd.to_datetime(df['Time'], format='%Y-%m-%d-%H.%M.%S.%f')
    df["Label"] = df["Label"].apply(lambda x: int(x != "-"))
    df['timestamp'] = df["datetime"].values.astype(np.int64) // 10 ** 9
    df['deltaT'] = df['datetime'].diff() / np.timedelta64(1, 's')
    df['deltaT'].fillna(0)
    # convert time to UTC timestamp
    # df['deltaT'] = df['datetime'].apply(lambda t: (t - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'))

    # sampling with fixed window
    # features = ["EventId", "deltaT"]
    # target = "Label"
    # deeplog_df = deeplog_df_transfer(df, features, target, "datetime", window_size=args.w)
    # deeplog_df.dropna(subset=[target], inplace=True)

    # sampling with sliding window
    deeplog_df = sliding_window(df[["timestamp", "Label", "EventId", "deltaT"]],
                                para={"window_size": int(window_size)*60, "step_size": int(step_size) * 60}
                                )

    #########
    # Train #
    #########
    df_normal =deeplog_df[deeplog_df["Label"] == 0]
    df_normal = df_normal.sample(frac=1, random_state=12).reset_index(drop=True) #shuffle
    normal_len = len(df_normal)
    train_len = int(normal_len * train_ratio)

    train = df_normal[:train_len]
    # deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId", "deltaT"])
    deeplog_file_generator(os.path.join(output_dir,'train'), train, ["EventId"])

    print("training size {}".format(train_len))


    ###############
    # Test Normal #
    ###############
    test_normal = df_normal[train_len:]
    deeplog_file_generator(os.path.join(output_dir, 'test_normal'), test_normal, ["EventId"])
    print("test normal size {}".format(normal_len - train_len))

    del df_normal
    del train
    del test_normal
    gc.collect()

    #################
    # Test Abnormal #
    #################
    df_abnormal = deeplog_df[deeplog_df["Label"] == 1]
    #df_abnormal["EventId"] = df_abnormal["EventId"].progress_apply(lambda e: event_index_map[e] if event_index_map.get(e) else UNK)
    deeplog_file_generator(os.path.join(output_dir,'test_abnormal'), df_abnormal, ["EventId"])
    print('test abnormal size {}'.format(len(df_abnormal)))

BGL/deeplog.py

0 → 100644
+126 −0
原始行号 差异行号 差异行
# -*- coding: utf-8 -*-
import platform
import argparse
import sys
sys.path.append('../')

from logdeep.models.lstm import *
from logdeep.tools.predict import Predicter
from logdeep.tools.train import Trainer
from logdeep.tools.utils import *
from logdeep.dataset.vocab import Vocab

import torch

output_dir = "../output/bgl/"

# Config Parameters
options = dict()
options['output_dir'] = output_dir
options['train_vocab'] = output_dir + 'train'
options["vocab_path"] = output_dir + "vocab.pkl"

options['device'] = 'cuda' if torch.cuda.is_available() else 'cpu'

# Smaple
options['sample'] = "sliding_window"
options['window_size'] = 20  # if fix_window
options['train_ratio'] = 1
options['valid_ratio'] = 0.1
options['test_ratio'] = 1
options["min_len"] = 10

options["is_logkey"] = True
options["is_time"] = False

# Features
options['sequentials'] = options["is_logkey"]
options['quantitatives'] = False
options['semantics'] = False
options['parameters'] = options["is_time"]
options['feature_num'] = sum(
    [options['sequentials'], options['quantitatives'], options['semantics'], options['parameters']])

# Model
options['input_size'] = 1
options['hidden_size'] = 64
options['num_layers'] = 2
options["embedding_dim"] = 50
options["vocab_size"] = 200
options['num_classes'] = options["vocab_size"]
# Train
options['batch_size'] = 128
options['accumulation_step'] = 1

options['optimizer'] = 'adam'
options['lr'] = 0.01
options['max_epoch'] = 200
options["n_epochs_stop"] = 10
options['lr_step'] = (options['max_epoch'] - 20, options['max_epoch'])
options['lr_decay_ratio'] = 0.1

options['resume_path'] = None
options['model_name'] = "deeplog"
options['save_dir'] = options["output_dir"] + "deeplog/"

# Predict
options['model_path'] = options["save_dir"] + "bestloss.pth"
options['num_candidates'] = 9
options["threshold"] = None
options["gaussian_mean"] = 0
options["gaussian_std"] = 0
options["num_outputs"] = 1


print("Features logkey:{} time: {}".format(options["is_logkey"], options["is_time"]))
print("Device:", options['device'])

seed_everything(seed=1234)

Model = Deeplog(input_size=options['input_size'],
                hidden_size=options['hidden_size'],
                num_layers=options['num_layers'],
                vocab_size=options["vocab_size"],
                embedding_dim=options["embedding_dim"])


def train():
    trainer = Trainer(Model, options)
    trainer.start_train()


def predict():
    predicter = Predicter(Model, options)
    predicter.predict_unsupervised()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    train_parser = subparsers.add_parser('train')
    train_parser.set_defaults(mode='train')

    predict_parser = subparsers.add_parser('predict')
    predict_parser.set_defaults(mode='predict')
    predict_parser.add_argument('--mean', type=float, default=0, help='error gaussian distribution mean')
    predict_parser.add_argument('--std', type=float, default=0, help='error gaussian distribution std')

    vocab_parser = subparsers.add_parser('vocab')
    vocab_parser.set_defaults(mode='vocab')

    args = parser.parse_args()
    print("arguments", args)

    if args.mode == 'train':
        train()

    elif args.mode == 'predict':
        predict()

    elif args.mode == 'vocab':
        with open(options["train_vocab"], 'r') as f:
            logs = f.readlines()
        vocab = Vocab(logs)
        print("vocab_size", len(vocab))
        vocab.save_vocab(options["vocab_path"])