Initial commit (de9728ee) · 提交 · AIOps-NanKai / model / NuLog

NuLog.py

0 → 100644

+0 −0

添加文件。

预览已超出大小限制，变更已折叠。

README.md

0 → 100644

+54 −0

原始行号	差异行号	差异行
		## NuLog

		Parsing semi-structured records with free-form text log messages into structured templates is the first and crucial step that enables further analysis. NuLog presents a novel parsing technique that utilizes a self-supervised learning model and formulates the parsing task as masked language modeling (MLM). In the process of parsing, the model extracts summarizations from the logs in the form of a vector embedding. This allows the coupling of the MLM as pre-training with a downstream anomaly detection task.

		Read more information about NuLog in the following paper:

		+ Sasho Nedelkoski, Jasmin Bogatinovski, Alexander Acker, Jorge Cardoso, Odej Kao. [Self-Supervised Log Parsing](https://arxiv.org/abs/2003.07905), Joint European Conference on Machine Learning and Knowledge Discovery in Databases (ECML-PKDD), 2020.


		### Running

		Note that we modify NuLog to support both CPU and GPU devices. We run the experiments on a P100 GPU machine.

		Install the required environment:

		```
		pip install -r requirements.txt
		```

		Run the following script to start the demo:

		```
		python demo.py
		```

		Run the following script to execute the benchmark:

		```
		python benchmark.py
		```

		### Benchmark

		Running the benchmark script on the Loghub_2k datasets yields the following results.

		| Dataset | F1_measure | Accuracy |
		|:-----------:|:----------|:---------|
		| BGL | 0.999779 | 0.9785 |
		| Android | 0.972805 | 0.831 |
		| OpenStack | 0.999856 | 0.968 |
		| HDFS | 0.99998 | 0.9965 |
		| Apache | 1 | 1 |
		| HPC | 0.994403 | 0.9465 |
		| Windows | 0.999983 | 0.9945 |
		| HealthApp | 0.996484 | 0.8765 |
		| Mac | 0.748933 | 0.8165 |
		| Spark | 0.999996 | 0.998 |

		### Citation

		:telescope: If you use our logparser tools or benchmarking results in your publication, please kindly cite the following papers.

		+ [ICSE'19] Jieming Zhu, Shilin He, Jinyang Liu, Pinjia He, Qi Xie, Zibin Zheng, Michael R. Lyu. [Tools and Benchmarks for Automated Log Parsing](https://arxiv.org/pdf/1811.03509.pdf). International Conference on Software Engineering (ICSE), 2019.
		+ [DSN'16] Pinjia He, Jieming Zhu, Shilin He, Jian Li, Michael R. Lyu. [An Evaluation Study on Log Parsing and Its Use in Log Mining](https://jiemingzhu.github.io/pub/pjhe_dsn2016.pdf). IEEE/IFIP International Conference on Dependable Systems and Networks (DSN), 2016.

__init__.py

0 → 100644

+1 −0

原始行号	差异行号	差异行
		from .NuLog import *

benchmark.py

0 → 100644

+141 −0

原始行号	差异行号	差异行
		# =========================================================================
		# Copyright (C) 2016-2023 LOGPAI (https://github.com/logpai).
		#
		# Licensed under the Apache License, Version 2.0 (the "License");
		# you may not use this file except in compliance with the License.
		# You may obtain a copy of the License at
		#
		# http://www.apache.org/licenses/LICENSE-2.0
		#
		# Unless required by applicable law or agreed to in writing, software
		# distributed under the License is distributed on an "AS IS" BASIS,
		# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
		# See the License for the specific language governing permissions and
		# limitations under the License.
		# =========================================================================


		import sys
		sys.path.append("../../")
		from logparser.NuLog import LogParser
		from logparser.utils import evaluator
		import os
		import pandas as pd


		input_dir = "../../data/loghub_2k/"  # The input directory of log file
		output_dir = "NuLog_result/"  # The output directory of parsing results


		# Per-dataset benchmark configuration, keyed by dataset name.
		#
		# Each entry provides:
		#   log_file    - path of the raw log, relative to ``input_dir``
		#   log_format  - header layout handed to ``LogParser`` (``<Field>`` placeholders)
		#   filters     - regular expression handed to ``LogParser`` as ``filters``
		#                 (presumably the token delimiters - confirm in NuLog.py)
		#   k           - handed to ``LogParser`` as ``k`` (meaning defined in NuLog.py)
		#   nr_epochs   - forwarded to ``LogParser.parse``
		#   num_samples - forwarded to ``LogParser.parse``
		#
		# Patterns are raw strings so that regex escapes such as ``\(`` or ``\s``
		# reach ``re`` untouched and do not trigger invalid-escape warnings.
		benchmark_settings = {
		    "BGL": {
		        "log_file": "BGL/BGL_2k.log",
		        "log_format": "<Label> <Timestamp> <Date> <Node> <Time> <NodeRepeat> <Type> <Component> <Level> <Content>",
		        "filters": r"([ |:|\(|\)|=|,])|(core.)|(\.{2,})",
		        "k": 50,
		        "nr_epochs": 3,
		        "num_samples": 0,
		    },
		    "Android": {
		        "log_file": "Android/Android_2k.log",
		        "log_format": "<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>",
		        "filters": r'([ |:|\(|\)|=|,|"|\{|\}|@|$|\[|\]|\||;])',
		        "k": 25,
		        "nr_epochs": 5,
		        "num_samples": 5000,
		    },
		    "OpenStack": {
		        "log_file": "OpenStack/OpenStack_2k.log",
		        "log_format": r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>",
		        "filters": r'([ |:|\(|\)|"|\{|\}|@|$|\[|\]|\||;])',
		        "k": 5,
		        "nr_epochs": 6,
		        "num_samples": 0,
		    },
		    "HDFS": {
		        "log_file": "HDFS/HDFS_2k.log",
		        "log_format": "<Date> <Time> <Pid> <Level> <Component>: <Content>",
		        "filters": r"(\s+blk_)|(:)|(\s)",
		        "k": 15,
		        "nr_epochs": 5,
		        "num_samples": 0,
		    },
		    "Apache": {
		        "log_file": "Apache/Apache_2k.log",
		        "log_format": r"\[<Time>\] \[<Level>\] <Content>",
		        "filters": "([ ])",
		        "k": 12,
		        "nr_epochs": 5,
		        "num_samples": 0,
		    },
		    "HPC": {
		        "log_file": "HPC/HPC_2k.log",
		        "log_format": "<LogId> <Node> <Component> <State> <Time> <Flag> <Content>",
		        "filters": "([ |=])",
		        "num_samples": 0,
		        "k": 10,
		        "nr_epochs": 3,
		    },
		    "Windows": {
		        "log_file": "Windows/Windows_2k.log",
		        "log_format": "<Date> <Time>, <Level> <Component> <Content>",
		        "filters": "([ ])",
		        "num_samples": 0,
		        "k": 95,
		        "nr_epochs": 5,
		    },
		    "HealthApp": {
		        "log_file": "HealthApp/HealthApp_2k.log",
		        "log_format": r"<Time>\|<Component>\|<Pid>\|<Content>",
		        "filters": "([ ])",
		        "num_samples": 0,
		        "k": 100,
		        "nr_epochs": 5,
		    },
		    "Mac": {
		        "log_file": "Mac/Mac_2k.log",
		        "log_format": r"<Month> <Date> <Time> <User> <Component>\[<PID>\]( \(<Address>\))?: <Content>",
		        "filters": r"([ ])|([\w-]+\.){2,}[\w-]+",
		        "num_samples": 0,
		        "k": 300,
		        "nr_epochs": 10,
		    },
		    "Spark": {
		        "log_file": "Spark/Spark_2k.log",
		        "log_format": "<Date> <Time> <Level> <Component>: <Content>",
		        "filters": (
		            r"([ ])|(\d+\sB)|(\d+\sKB)|(\d+\.){3}\d+|"
		            # NOTE(review): this fragment is deliberately NOT a raw string.
		            # The original pattern was a plain string, so "\b" is a
		            # backspace character rather than a regex word boundary.  That
		            # value is preserved here; switching to a word boundary would
		            # change how Spark lines are split - confirm before changing.
		            "\b[KGTM]?B\b"
		            r"|([\w-]+\.){2,}[\w-]+"
		        ),
		        "num_samples": 0,
		        "k": 50,
		        "nr_epochs": 3,
		    },
		}

		# Parse every configured dataset with NuLog, score the output against the
		# ground-truth templates, and collect one [dataset, F1, accuracy] row each.
		benchmark_result = []
		for dataset, setting in benchmark_settings.items():
		    print("\n=== Evaluation on %s ===" % dataset)
		    # "log_file" is "<Dataset>/<file>"; split it into directory and file name.
		    indir = os.path.join(input_dir, os.path.dirname(setting["log_file"]))
		    log_file = os.path.basename(setting["log_file"])

		    parser = LogParser(
		        indir=indir,
		        outdir=output_dir,
		        filters=setting["filters"],
		        k=setting["k"],
		        log_format=setting["log_format"],
		    )
		    parser.parse(
		        log_file, nr_epochs=setting["nr_epochs"], num_samples=setting["num_samples"]
		    )

		    # Ground truth sits next to the raw log; the parsed result is read
		    # from the output directory under the same "<log>_structured.csv" name.
		    F1_measure, accuracy = evaluator.evaluate(
		        groundtruth=os.path.join(indir, log_file + "_structured.csv"),
		        parsedresult=os.path.join(output_dir, log_file + "_structured.csv"),
		    )
		    benchmark_result.append([dataset, F1_measure, accuracy])


		print("\n=== Overall evaluation results ===")
		df_result = pd.DataFrame(benchmark_result, columns=["Dataset", "F1_measure", "Accuracy"])
		df_result.set_index("Dataset", inplace=True)
		print(df_result)
		# The file name keeps its historical spelling ("bechmark") so that anything
		# consuming the CSV keeps working.
		df_result.to_csv("NuLog_bechmark_result.csv", float_format="%.6f")

demo.py

0 → 100644

+23 −0

原始行号	差异行号	差异行
		#!/usr/bin/env python

		import sys
		sys.path.append('../../')
		from logparser.NuLog import LogParser

		# Demo: parse the 2k-line HDFS sample with NuLog using the HDFS settings.
		input_dir = '../../data/loghub_2k/HDFS/'  # The input directory of log file
		output_dir = 'demo_result/'  # The output directory of parsing results
		log_file = 'HDFS_2k.log'  # The input log file name
		log_format = '<Date> <Time> <Pid> <Level> <Component>: <Content>'  # HDFS log format
		# Regular expression list for optional preprocessing (default: [])
		# NOTE(review): this list is defined but never passed to LogParser in this
		# script - confirm whether it is still needed.
		regex = [
		    r'blk_(|-)[0-9]+',  # block id
		    r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',  # IP
		    r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$',  # Numbers
		]
		# Raw string so the regex escapes (\s) are passed through to ``re`` unchanged.
		filters = r"(\s+blk_)|(:)|(\s)"
		k = 15  # handed to LogParser as ``k`` (meaning defined in NuLog.py)
		nr_epochs = 5  # Number of epochs to run
		num_samples = 0  # forwarded to LogParser.parse

		parser = LogParser(log_format=log_format, indir=input_dir, outdir=output_dir, filters=filters, k=k)
		parser.parse(log_file, nr_epochs=nr_epochs, num_samples=num_samples)