提交 de9728ee 编辑于 作者: openaiops's avatar openaiops
浏览文件

Initial commit

上级
加载中
加载中
加载中
加载中
加载中

NuLog.py

0 → 100644
+0 −0

添加文件。

预览已超出大小限制,变更已折叠。

README.md

0 → 100644
+54 −0
原始行号 差异行号 差异行
## NuLog

Parsing semi-structured records with free-form text log messages into structured templates is the first and crucial step that enables further analysis. NuLog presents a novel parsing technique that utilizes a self-supervised learning model and formulates the parsing task as masked language modeling (MLM). In the process of parsing, the model extracts summarizations from the logs in the form of a vector embedding. This allows the coupling of the MLM as pre-training with a downstream anomaly detection task. 

Read more information about NuLog in the following paper:

+ Sasho Nedelkoski, Jasmin Bogatinovski, Alexander Acker, Jorge Cardoso, Odej Kao. [Self-Supervised Log Parsing](https://arxiv.org/abs/2003.07905), *Joint European Conference on Machine Learning and Knowledge Discovery in Databases (ECML-PKDD)*, 2020.


### Running

Note that we modify NuLog to support both CPU and GPU devices. We run the experiments on a P100 GPU machine.

Install the required environment:

```
pip install -r requirements.txt
```

Run the following script to start the demo:

```
python demo.py
```

Run the following script to execute the benchmark:

```
python benchmark.py
```

### Benchmark

Running the benchmark script on the Loghub_2k datasets should produce results similar to the following.

|   Dataset   | F1_measure | Accuracy |
|:-----------:|:----------|:---------|
| BGL | 0.999779 | 0.9785 |
| Android | 0.972805 | 0.831 |
| OpenStack | 0.999856 | 0.968 |
| HDFS | 0.99998 | 0.9965 |
| Apache | 1 | 1 |
| HPC | 0.994403 | 0.9465 |
| Windows | 0.999983 | 0.9945 |
| HealthApp | 0.996484 | 0.8765 |
| Mac | 0.748933 | 0.8165 |
| Spark | 0.999996 | 0.998 |

### Citation

:telescope: If you use our logparser tools or benchmarking results in your publication, please kindly cite the following papers.

+ [**ICSE'19**] Jieming Zhu, Shilin He, Jinyang Liu, Pinjia He, Qi Xie, Zibin Zheng, Michael R. Lyu. [Tools and Benchmarks for Automated Log Parsing](https://arxiv.org/pdf/1811.03509.pdf). *International Conference on Software Engineering (ICSE)*, 2019.
+ [**DSN'16**] Pinjia He, Jieming Zhu, Shilin He, Jian Li, Michael R. Lyu. [An Evaluation Study on Log Parsing and Its Use in Log Mining](https://jiemingzhu.github.io/pub/pjhe_dsn2016.pdf). *IEEE/IFIP International Conference on Dependable Systems and Networks (DSN)*, 2016.

__init__.py

0 → 100644
+1 −0
原始行号 差异行号 差异行
from .NuLog import *

benchmark.py

0 → 100644
+141 −0
原始行号 差异行号 差异行
# =========================================================================
# Copyright (C) 2016-2023 LOGPAI (https://github.com/logpai).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =========================================================================


import sys
sys.path.append("../../")
from logparser.NuLog import LogParser
from logparser.utils import evaluator
import os
import pandas as pd


input_dir = "../../data/loghub_2k/"  # The input directory of log file
output_dir = "NuLog_result/"  # The output directory of parsing results


# Per-dataset benchmark configuration, keyed by dataset name.
#   log_file    - path of the raw log, relative to ``input_dir``
#   log_format  - passed to LogParser as ``log_format``
#   filters     - regex-style pattern passed to LogParser as ``filters``
#   k           - passed to LogParser as ``k``
#   nr_epochs   - passed to ``LogParser.parse`` as ``nr_epochs``
#   num_samples - passed to ``LogParser.parse`` as ``num_samples``
#
# Every pattern is a raw string: the backslashes are regex escapes, and most of
# them (``\(``, ``\s``, ``\d`` ...) are not valid Python string escapes, which
# raises a SyntaxWarning on Python 3.12+ when written as ordinary literals.
benchmark_settings = {
    "BGL": {
        "log_file": "BGL/BGL_2k.log",
        "log_format": r"<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>",
        "filters": r"([ |:|\(|\)|=|,])|(core.)|(\.{2,})",
        "k": 50,
        "nr_epochs": 3,
        "num_samples": 0,
    },
    "Android": {
        "log_file": "Android/Android_2k.log",
        "log_format": r"<Date> <Time>  <Pid>  <Tid> <Level> <Component>: <Content>",
        "filters": r'([ |:|\(|\)|=|,|"|\{|\}|@|$|\[|\]|\||;])',
        "k": 25,
        "nr_epochs": 5,
        "num_samples": 5000,
    },
    "OpenStack": {
        "log_file": "OpenStack/OpenStack_2k.log",
        "log_format": r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>",
        "filters": r'([ |:|\(|\)|"|\{|\}|@|$|\[|\]|\||;])',
        "k": 5,
        "nr_epochs": 6,
        "num_samples": 0,
    },
    "HDFS": {
        "log_file": "HDFS/HDFS_2k.log",
        "log_format": r"<Date> <Time> <Pid> <Level> <Component>: <Content>",
        "filters": r"(\s+blk_)|(:)|(\s)",
        "k": 15,
        "nr_epochs": 5,
        "num_samples": 0,
    },
    "Apache": {
        "log_file": "Apache/Apache_2k.log",
        "log_format": r"\[<Time>\] \[<Level>\] <Content>",
        "filters": r"([ ])",
        "k": 12,
        "nr_epochs": 5,
        "num_samples": 0,
    },
    "HPC": {
        "log_file": "HPC/HPC_2k.log",
        "log_format": r"<LogId> <Node> <Component> <State> <Time> <Flag> <Content>",
        "filters": r"([ |=])",
        "k": 10,
        "nr_epochs": 3,
        "num_samples": 0,
    },
    "Windows": {
        "log_file": "Windows/Windows_2k.log",
        "log_format": r"<Date> <Time>, <Level>                  <Component>    <Content>",
        "filters": r"([ ])",
        "k": 95,
        "nr_epochs": 5,
        "num_samples": 0,
    },
    "HealthApp": {
        "log_file": "HealthApp/HealthApp_2k.log",
        "log_format": r"<Time>\|<Component>\|<Pid>\|<Content>",
        "filters": r"([ ])",
        "k": 100,
        "nr_epochs": 5,
        "num_samples": 0,
    },
    "Mac": {
        "log_file": "Mac/Mac_2k.log",
        "log_format": r"<Month>  <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>",
        "filters": r"([ ])|([\w-]+\.){2,}[\w-]+",
        "k": 300,
        "nr_epochs": 10,
        "num_samples": 0,
    },
    "Spark": {
        "log_file": "Spark/Spark_2k.log",
        "log_format": r"<Date> <Time> <Level> <Component>: <Content>",
        # ``\b`` has to be a regex word boundary here. In an ordinary (non-raw)
        # literal Python turns ``\b`` into a backspace character (0x08), which
        # makes the ``[KGTM]?B`` size-unit alternative impossible to match.
        # NOTE(review): with the alternative active, Spark scores may differ
        # slightly from figures produced with the non-raw pattern.
        "filters": r"([ ])|(\d+\sB)|(\d+\sKB)|(\d+\.){3}\d+|\b[KGTM]?B\b|([\w-]+\.){2,}[\w-]+",
        "k": 50,
        "nr_epochs": 3,
        "num_samples": 0,
    },
}

# Run NuLog on every configured dataset and collect one
# [dataset, F1_measure, accuracy] row per dataset.
result_rows = []
for dataset_name, cfg in benchmark_settings.items():
    print("\n=== Evaluation on %s ===" % dataset_name)

    # e.g. "BGL/BGL_2k.log" -> sub-directory "BGL" and file name "BGL_2k.log".
    sub_dir, log_name = os.path.split(cfg["log_file"])
    dataset_dir = os.path.join(input_dir, sub_dir)
    # The ground truth and the parser output share the same file name and
    # differ only in the directory they are read from.
    structured_name = log_name + "_structured.csv"

    log_parser = LogParser(
        log_format=cfg["log_format"],
        indir=dataset_dir,
        outdir=output_dir,
        filters=cfg["filters"],
        k=cfg["k"],
    )
    log_parser.parse(
        log_name,
        nr_epochs=cfg["nr_epochs"],
        num_samples=cfg["num_samples"],
    )

    f1_score, parsing_accuracy = evaluator.evaluate(
        groundtruth=os.path.join(dataset_dir, structured_name),
        parsedresult=os.path.join(output_dir, structured_name),
    )
    result_rows.append([dataset_name, f1_score, parsing_accuracy])


# Print the per-dataset summary table and persist it next to the script.
print("\n=== Overall evaluation results ===")
summary = pd.DataFrame(
    result_rows, columns=["Dataset", "F1_measure", "Accuracy"]
).set_index("Dataset")
print(summary)
summary.to_csv("NuLog_bechmark_result.csv", float_format="%.6f")

demo.py

0 → 100644
+23 −0
原始行号 差异行号 差异行
#!/usr/bin/env python

import sys
sys.path.append('../../')
from logparser.NuLog import LogParser

# Demo: parse the HDFS 2k sample log with NuLog and write the results to
# `output_dir`.
input_dir = '../../data/loghub_2k/HDFS/' # The input directory of log file
output_dir = 'demo_result/'  # The output directory of parsing results
log_file = 'HDFS_2k.log'  # The input log file name
log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
# Regular expression list for optional preprocessing (default: [])
# NOTE(review): this list is not passed to LogParser below, so it currently has
# no effect on the demo - confirm whether it should be wired in or dropped.
regex = [
    r'blk_(|-)[0-9]+' , # block id
    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)', # IP
    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$', # Numbers
]
# Raw string: `\s` is a regex escape, not a valid Python string escape, so a
# plain literal raises a SyntaxWarning on Python 3.12+. The value is unchanged.
filters = r"(\s+blk_)|(:)|(\s)"
k = 15
nr_epochs = 5 # Number of epochs to run
num_samples = 0

parser = LogParser(log_format=log_format, indir=input_dir, outdir=output_dir, filters=filters, k=k)
parser.parse(log_file, nr_epochs=nr_epochs, num_samples=num_samples)