提交 3bfd88e8 编辑于 作者: openaiops's avatar openaiops
浏览文件

Initial commit

上级
加载中
加载中
加载中
加载中
加载中

LICENSE

0 → 100644
+22 −0
原始行号 差异行号 差异行
MIT License

Copyright (c) 2021 hanxiao0607

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
 No newline at end of file

README.md

0 → 100644
+50 −0
原始行号 差异行号 差异行
[![License](https://img.shields.io/badge/License-MIT-red.svg)](https://github.com/hanxiao0607/LogTAD/blob/main/LICENSE)
![Python 3.9](https://img.shields.io/badge/python-3.9-blue.svg)
[![Hits](https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fgithub.com%2Fhanxiao0607%2FLogTAD&count_bg=%2379C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false)](https://hits.seeyoufarm.com)

# LogTAD: Unsupervised Cross-system Log Anomaly Detection via Domain Adaptation
A Pytorch implementation of [LogTAD](https://dl.acm.org/doi/abs/10.1145/3459637.3482209).

## Configuration
- Ubuntu 20.04
- NVIDIA driver 460.73.01 
- CUDA 11.2
- Python 3.9
- PyTorch 1.9.0

## Installation
This code requires the packages listed in requirements.txt.
A virtual environment is recommended to run this code.

On macOS and Linux:  
```
python3 -m pip install --user virtualenv
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
deactivate
```
Reference: https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/

## Instructions
LogTAD and the other baseline models are implemented on the [BGL](https://github.com/logpai/loghub/tree/master/BGL) and [Thunderbird](https://github.com/logpai/loghub/tree/master/Thunderbird) datasets.

Clone the template project, replacing ``my-project`` with the name of the project you are creating:

        git clone https://github.com/hanxiao0607/LogTAD.git my-project
        cd my-project

Run and test:

        python3 main_LogTAD.py

## Citation
```
@inproceedings{han2021unsupervised,
  title={Unsupervised Cross-system Log Anomaly Detection via Domain Adaptation},
  author={Han, Xiao and Yuan, Shuhan},
  booktitle={Proceedings of the 30th ACM International Conference on Information \& Knowledge Management},
  pages={3068--3072},
  year={2021}
}
```

main_LogTAD.py

0 → 100644
+89 −0
原始行号 差异行号 差异行
import os
from utils import preprocessing, SlidingWindow
from utils.utils import set_seed, get_train_eval_iter
from argparse import ArgumentParser
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from model.LogTAD import LogTAD

def arg_parser():
    """Build the command-line argument parser for LogTAD.

    Every numeric option carries an explicit ``type=`` converter so that
    values supplied on the command line arrive as ``int``/``float`` rather
    than ``str`` (without ``type=``, argparse only preserves the type of the
    default, and e.g. ``--download_datasets 0`` would yield the string "0",
    breaking the ``== 1`` comparison in ``main``).

    :return: the configured ``ArgumentParser``
    """
    parser = ArgumentParser()
    parser.add_argument("--source_dataset_name", help="please choose source dataset name from BGL or Thunderbird", default='Thunderbird')
    parser.add_argument("--target_dataset_name", help="please choose target dataset name from BGL or Thunderbird", default='BGL')
    parser.add_argument("--device", help="hardware device", default="cuda")
    parser.add_argument("--output_dir", metavar="DIR", help="output directory", default="/Dataset")
    parser.add_argument("--model_dir", metavar="DIR", help="model directory", default="/Dataset")
    parser.add_argument("--random_seed", help="random seed", type=int, default=42)
    parser.add_argument("--download_datasets", help="download datasets or not", type=int, default=1)

    # training parameters
    parser.add_argument("--max_epoch", help="epochs", type=int, default=100)
    parser.add_argument("--batch_size", help="batch size", type=int, default=1024)
    parser.add_argument("--lr", help="learning rate", type=float, default=0.001)
    parser.add_argument("--weight_decay", help="weight decay", type=float, default=1e-6)
    parser.add_argument("--eps", help="minimum center value", type=float, default=0.1)

    # word2vec parameters
    parser.add_argument("--emb_dim", help="word2vec vector size", type=int, default=300)

    # data preprocessing parameters
    parser.add_argument("--window_size", help="size of sliding window", type=int, default=20)
    parser.add_argument("--step_size", help="step size of sliding window", type=int, default=4)
    parser.add_argument("--train_size_s", help="source training size", type=int, default=100000)
    parser.add_argument("--train_size_t", help="target training size", type=int, default=1000)

    # LSTM parameters
    parser.add_argument("--hid_dim", help="hidden dimensions", type=int, default=128)
    parser.add_argument("--out_dim", help="output dimensions", type=int, default=2)
    parser.add_argument("--n_layers", help="layers of LSTM", type=int, default=2)
    parser.add_argument("--dropout", help="dropout", type=float, default=0.3)
    # bool("False") is True, so a plain type=bool would make any CLI value
    # truthy; parse common false-y spellings explicitly instead.
    parser.add_argument("--bias", help="bias for LSTM",
                        type=lambda s: str(s).lower() not in ("0", "false", "no"), default=True)

    # gradient reversal parameters
    parser.add_argument("--alpha", help="alpha value for the gradient reversal layer", type=float, default=0.1)

    # test parameters
    parser.add_argument("--test_ratio", help="testing ratio", type=float, default=0.1)

    return parser

def main():
    """Entry point: parse CLI options, optionally download/parse the raw
    logs, build sliding-window datasets, train LogTAD on the source system,
    and evaluate anomaly detection on both the source and target systems.

    :return: 1 when the dataset directory is missing/empty, ``None`` otherwise
    """
    parser = arg_parser()
    args = parser.parse_args()

    options = vars(args)

    set_seed(options["random_seed"])
    print(f"Set seed: {options['random_seed']}")
    if options["download_datasets"] == 1:
        preprocessing.parsing(options["source_dataset_name"], options["output_dir"])
        preprocessing.parsing(options["target_dataset_name"], options["output_dir"])

    # Guard both the missing-directory and empty-directory cases:
    # os.listdir() alone raises FileNotFoundError when ./Dataset is absent,
    # which would bypass the friendly message below.
    path = "./Dataset"
    if not os.path.isdir(path) or not os.listdir(path):
        print("Please download the dataset first")
        return 1

    df_source = pd.read_csv(f'./Dataset/{options["source_dataset_name"]}.log_structured.csv')
    print(f'Reading source dataset: {options["source_dataset_name"]} dataset')
    df_target = pd.read_csv(f'./Dataset/{options["target_dataset_name"]}.log_structured.csv')
    print(f'Reading target dataset: {options["target_dataset_name"]} dataset')
    # Build windowed train/test splits for both systems plus the trained
    # word2vec model used to embed log templates.
    train_normal_s, test_normal_s, test_abnormal_s, r_s_val_df, train_normal_t, test_normal_t, test_abnormal_t, r_t_val_df, w2v = SlidingWindow.get_datasets(df_source, df_target, options)
    train_iter, test_iter = get_train_eval_iter(train_normal_s, train_normal_t)
    demo_logtad = LogTAD(options)
    demo_logtad.train_LogTAD(train_iter, test_iter, w2v)
    demo_logtad.load_model()
    # Calibrate the detection radius R on held-out validation windows,
    # separately for the source and target systems.
    R_src, _ = demo_logtad.get_r_from_val(r_s_val_df)
    R_trg, _ = demo_logtad.get_r_from_val(r_t_val_df)
    print(f'Starting to test source dataset: {options["source_dataset_name"]}')
    demo_logtad.testing(test_normal_s, test_abnormal_s, R_src)
    print(f'Starting to test target dataset: {options["target_dataset_name"]}')
    demo_logtad.testing(test_normal_t, test_abnormal_t, R_trg)

if __name__ == "__main__":
    main()
+41 −0
原始行号 差异行号 差异行
import torch.nn as nn
import torch
from torch.autograd import Function

class GRL(Function):
    """Gradient Reversal Layer (Ganin & Lempitsky, DANN).

    Acts as the identity in the forward pass and multiplies the incoming
    gradient by ``-alpha`` in the backward pass, so the feature extractor
    is trained adversarially against the domain discriminator.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Identity forward; stash alpha on the context for backward.
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        # Reverse and scale the gradient; None corresponds to the
        # non-tensor `alpha` input, which needs no gradient.
        return grad_output.neg() * ctx.alpha, None

class DA_LSTM(nn.Module):
    """LSTM encoder with a domain-adversarial discriminator head.

    The encoder produces a sequence embedding (the mean of the LSTM outputs
    over time); the discriminator, fed through a gradient reversal layer,
    predicts which system (domain) the sequence came from.
    """

    def __init__(self, emb_dim, hid_dim, output_dim, n_layers, dropout, bias):
        """
        :param emb_dim: word2vec embedding dimension of each log message
        :param hid_dim: LSTM hidden size (also the sequence-embedding size)
        :param output_dim: number of domains the discriminator predicts
        :param n_layers: number of stacked LSTM layers
        :param dropout: inter-layer LSTM dropout probability
        :param bias: whether the LSTM uses bias terms
        """
        super().__init__()
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, bidirectional=False, batch_first=True,
                           bias=bias)

        self.discriminator = nn.Sequential(
            nn.Linear(hid_dim, 64),
            nn.Linear(64, output_dim)
        )

    def forward(self, input, alpha):
        """Encode a batch of windows and classify their domain.

        :param input: tensor of shape (batch, window, emb_dim)
        :param alpha: gradient-reversal scaling factor
        :return: (sequence embedding of shape (batch, hid_dim),
                  domain logits of shape (batch, output_dim))
        """
        output, (hidden, cell) = self.rnn(input)
        # Mean-pool over the time dimension once and reuse it for both the
        # returned embedding and the discriminator input (the original
        # computed torch.mean twice).
        emb = torch.mean(output, dim=1)
        y = GRL.apply(emb, alpha)
        y = self.discriminator(y)
        return emb, y
 No newline at end of file

model/LogTAD.py

0 → 100644
+260 −0
原始行号 差异行号 差异行
from . import DomainAdversarial
import torch.nn as nn
import torch.optim as optim
import torch
from utils.utils import get_center, epoch_time, get_dist, dist2label, get_iter
import numpy as np
from tqdm import tqdm
import time
import pandas as pd
from gensim.models import Word2Vec
from sklearn import metrics

class LogTAD(nn.Module):
    """Cross-system log anomaly detector.

    Wraps a domain-adversarial LSTM encoder trained with a Deep-SVDD-style
    objective: normal windows are pulled toward a shared hypersphere center
    while a domain classifier (through a gradient reversal layer) aligns the
    source- and target-system embeddings. At test time a window is flagged
    anomalous when its distance to the center exceeds a calibrated radius R.
    """

    def __init__(self, options):
        """Build the encoder, optimizer and losses from the CLI options dict."""
        super().__init__()
        self.emb_dim = options["emb_dim"]
        self.hid_dim = options["hid_dim"]
        self.output_dim = options["out_dim"]
        self.n_layers = options["n_layers"]
        self.dropout = options["dropout"]
        self.bias = options["bias"]
        self.device = options["device"]
        self.weight_decay = options["weight_decay"]
        self.window_size = options["window_size"]
        self.step_size = options["step_size"]
        self.encoder = DomainAdversarial.DA_LSTM(self.emb_dim, self.hid_dim, self.output_dim, self.n_layers, self.dropout, self.bias).to(self.device)
        self.optimizer = optim.Adam(self.encoder.parameters(), weight_decay=self.weight_decay)
        self.alpha = options["alpha"]
        self.max_epoch = options["max_epoch"]
        self.eps = options["eps"]
        self.source_dataset_name = options["source_dataset_name"]
        self.target_dataset_name = options["target_dataset_name"]
        self.test_ratio = options["test_ratio"]
        self.loss_mse = nn.MSELoss()  # distance-to-center term
        self.loss_cel = nn.CrossEntropyLoss()  # domain-classification term
        self.w2v = None  # gensim Word2Vec model, set by train_LogTAD/load_model
        self.center = None  # hypersphere center, set after training/loading

    def _train(self, iterator, center):
        """Run one training epoch against the given center.

        Each batch is (embedded windows, domain labels, anomaly labels).
        Normal windows (label 0) minimize MSE to the center; anomalous ones
        (label 1) maximize it via a hinge-like (10 - mse) term.

        :return: mean loss per batch over the epoch
        """

        self.encoder.train()

        epoch_loss = 0

        for (i, batch) in enumerate(iterator):
            src = batch[0].to(self.device)
            domain_label = batch[1].to(self.device)
            labels = batch[2]
            self.optimizer.zero_grad()
            output, y_d = self.encoder(src, self.alpha)

            domain_label = domain_label.view(-1)
            center = center.to(self.device)

            mse = 0
            for (ind, val) in enumerate(output):
                if labels[ind] == 1:
                    # push anomalies away from the center (margin of 10)
                    mse += (10 - self.loss_mse(val, center))
                else:
                    mse += self.loss_mse(val, center)
            cel = self.loss_cel(y_d, domain_label.to(dtype=torch.long))
            # NOTE(review): 10e4 is 1e5, not 1e4 — confirm the intended
            # weighting of the distance term vs. the domain term.
            loss = mse * 10e4 + cel
            loss.backward()

            self.optimizer.step()

            epoch_loss += loss.item()

            # NOTE(review): .cpu() returns a copy and the results are
            # discarded, so these lines have no effect on the originals —
            # presumably meant to free GPU memory; verify.
            center.cpu()
            src.cpu()
            domain_label.cpu()
            output.cpu()
            y_d.cpu()

        return epoch_loss / len(iterator)

    def _evaluate(self, iterator, center, epoch):
        """Evaluate one epoch and, during the first 10 epochs, re-estimate
        the hypersphere center from the evaluation embeddings.

        :return: (mean loss per batch, possibly-updated center,
                  list of per-window distances to the center)
        """

        self.encoder.eval()

        epoch_loss = 0

        lst_dist = []

        lst_mse = []
        lst_cel = []

        with torch.no_grad():
            for (i, batch) in enumerate(iterator):
                src = batch[0].to(self.device)
                domain_label = batch[1].to(self.device)
                labels = batch[2]
                output, y_d = self.encoder(src, self.alpha)
                # accumulate all embeddings to recompute the center below
                if i == 0:
                    lst_emb = output
                else:
                    lst_emb = torch.cat((lst_emb, output), dim=0)

                domain_label = domain_label.view(-1)

                center = center.to(self.device)

                mse = 0
                for (ind, val) in enumerate(output):
                    if labels[ind] == 1:
                        mse += (10 - self.loss_mse(val, center))
                    else:
                        mse += self.loss_mse(val, center)

                cel = self.loss_cel(y_d, domain_label.to(dtype=torch.long))

                lst_mse.append(mse.detach().cpu().numpy())
                lst_cel.append(cel.detach().cpu().numpy())

                # same 10e4 (=1e5) weighting as in _train
                loss = mse * 10e4 + cel

                epoch_loss += loss.item()

                lst_dist.extend(get_dist(output, center))

                # NOTE(review): no-op .cpu() calls, as in _train
                src.cpu()
                domain_label.cpu()
                lst_emb.cpu()
                output.cpu()
                y_d.cpu()

        if epoch < 10:
            # Deep-SVDD-style warm-up: re-estimate the center from the mean
            # embedding, then clamp near-zero coordinates to +/- eps to avoid
            # the trivial all-zeros collapse.
            center = get_center(lst_emb)
            print('get center:', center)
            center[(abs(center) < self.eps) & (center < 0)] = -self.eps
            center[(abs(center) < self.eps) & (center > 0)] = self.eps
            print('new center', center)

        print('\nmse:', np.mean(np.array(lst_mse)))
        print('cel:', np.mean(np.array(lst_cel)))
        return epoch_loss / len(iterator), center, lst_dist

    def train_LogTAD(self, train_iter, eval_iter, w2v):
        """Full training loop.

        The center starts at the origin, is re-estimated each of the first
        10 epochs by _evaluate, and is frozen after epoch 9. Checkpoints
        (encoder weights + center) are saved whenever the eval loss improves
        once the center is fixed.

        NOTE(review): assumes max_epoch >= 10 — with fewer epochs
        `fixed_center` is never assigned and the save path would raise
        NameError; confirm.
        """

        best_eval_loss = float('inf')

        for epoch in tqdm(range(self.max_epoch)):

            if epoch == 0:
                center = torch.Tensor([0.0 for _ in range(self.hid_dim)])
            if epoch > 9:
                center = fixed_center
            start_time = time.time()
            train_loss = self._train(train_iter, center)

            eval_loss, center, _ = self._evaluate(eval_iter, center, epoch)

            if epoch == 9:
                # freeze the center after the warm-up phase
                fixed_center = center

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if eval_loss < best_eval_loss and epoch >= 9:
                best_eval_loss = eval_loss
                torch.save(self.encoder.state_dict(), f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}.pt')

                self.center = fixed_center.cpu()
                pd.DataFrame(fixed_center.cpu().numpy()).to_csv(f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}_center.csv')

            print(f'Epoch: {epoch + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
            print(f'\tTrain Loss: {train_loss:.10f}')
            print(f'\t Val. Loss: {eval_loss:.10f}')
        self.w2v = w2v
        w2v.save(f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}_w2v.bin')

    def load_model(self):
        """Restore the word2vec model, encoder weights and center saved by
        train_LogTAD from ./saved_model/."""
        self.w2v = Word2Vec.load(f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}_w2v.bin')
        self.encoder.load_state_dict(torch.load(f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}.pt'))
        self.encoder.to(self.device)
        self.center = torch.Tensor(
            pd.read_csv(f'./saved_model/{self.source_dataset_name}-{self.target_dataset_name}_center.csv', index_col=0).iloc[:, 0])

    def _test(self, iterator):
        """Run inference over an iterator.

        :return: (anomaly labels as ints, distances of each window's
                  embedding to the stored center)
        """
        self.encoder.eval()

        y = []
        lst_dist = []

        with torch.no_grad():
            for (i, batch) in enumerate(iterator):
                src = batch[0].to(self.device)
                label = batch[2]
                output, _ = self.encoder(src, self.alpha)
                for j in label:
                    y.append(int(j))
                lst_dist.extend(get_dist(output, self.center))
                src.cpu()
        return y, lst_dist

    def get_best_r(self, iterator, steps=100):
        """Sweep the detection radius R between the mean normal distance and
        the mean abnormal distance, keeping the R with the best ROC-AUC.

        :return: (best radius, best AUC)
        """
        y, lst_dist = self._test(iterator)
        df = pd.DataFrame()
        df['label'] = y
        df['dist'] = lst_dist
        print(df.groupby(['label']).describe())
        mean_normal = np.mean(df['dist'].loc[df['label'] == 0])
        mean_abnormal = np.mean(df['dist'].loc[df['label'] == 1])
        step_len = (mean_abnormal - mean_normal)/steps
        best_r = 0
        best_auc = -1
        R = mean_normal
        for i in range(steps):
            y_pre = dist2label(lst_dist, R)
            auc = metrics.roc_auc_score(y, y_pre)
            if auc > best_auc:
                best_r = R
                best_auc = auc
            R += step_len
        return best_r, best_auc

    def get_r_from_val(self, val_df):
        """Calibrate R on a validation DataFrame.

        Expects columns Embedding (nested per-window vectors), target
        (domain label) and Label (anomaly label); flattened embeddings are
        reshaped back to (window_size, emb_dim).
        """
        X = list(val_df.Embedding)
        X_new = []
        for i in X:
            temp = []
            for j in i:
                temp.extend(j)
            X_new.append(np.array(temp).reshape(self.window_size, self.emb_dim))
        y_d = list(val_df.target.values)
        y = list(val_df.Label.values)
        X = torch.tensor(X_new, requires_grad=False)
        y_d = torch.tensor(y_d).reshape(-1, 1).long()
        y = torch.tensor(y).reshape(-1, 1).long()
        iterator = get_iter(X, y_d, y)
        R, auc = self.get_best_r(iterator)
        return R, auc

    def testing(self, test_normal_df, test_abnormal_df, r, target=0):
        """Evaluate detection at radius ``r`` on a normal + abnormal split.

        Rows are subsampled with stride int(1/test_ratio); ``target``
        selects only which dataset name is printed (0 = source, 1 = target).
        Prints accuracy and a classification report.
        """
        X = list(test_normal_df.Embedding.values[::int(1 / self.test_ratio)])
        X.extend(list(test_abnormal_df.Embedding.values[::int(1 / self.test_ratio)]))
        X_new = []
        for i in tqdm(X):
            temp = []
            for j in i:
                temp.extend(j)
            X_new.append(np.array(temp).reshape(self.window_size, self.emb_dim))
        y_d = list(test_normal_df.target.values[::int(1 / self.test_ratio)])
        y_d.extend(list(test_abnormal_df.target.values[::int(1 / self.test_ratio)]))
        y = list(test_normal_df.Label.values[::int(1 / self.test_ratio)])
        y.extend(list(test_abnormal_df.Label.values[::int(1 / self.test_ratio)]))
        X_test = torch.tensor(X_new, requires_grad=False)
        y_d_test = torch.tensor(y_d).reshape(-1, 1).long()
        y_test = torch.tensor(y).reshape(-1, 1).long()
        test_iter = get_iter(X_test, y_d_test, y_test)
        y, lst_dist = self._test(test_iter)
        y_pred = dist2label(lst_dist, r)
        if target:
            print(f'Testing result for {self.target_dataset_name}:\n')
        else:
            print(f'Testing result for {self.source_dataset_name}:\n')

        print('Accuracy:', metrics.accuracy_score(y, y_pred))
        print(metrics.classification_report(y, y_pred, digits=5))