提交 ff9dfee7 编辑于 作者: openaiops's avatar openaiops
浏览文件

Initial commit

上级
加载中
加载中
加载中
加载中

LogCluster.py

0 → 100644
+173 −0
原始行号 差异行号 差异行
"""
Description : This file implements a wrapper around the original LogCluster code in perl
Author      : LogPAI team
License     : MIT
"""

import os
import pandas as pd
import re
import hashlib
from datetime import datetime
import subprocess


class LogParser():
    """Python wrapper that runs the Perl LogCluster script on a log file.

    The wrapper extracts the ``Content`` field of every log line, feeds the
    contents to ``logcluster.pl`` (expected next to this module), reads the
    clusters back and writes a ``<filename>_structured.csv`` file in which
    every log line is labelled with an ``EventId`` and an ``EventTemplate``.
    """

    # Scratch files exchanged with logcluster.pl; both are created in the
    # current working directory and removed again at the end of parse().
    _INPUT_FILE = "logcluster_input.log"
    _OUTPUT_FILE = "logcluster_output.txt"

    def __init__(self, indir, log_format, outdir, rex=None, support=None, rsupport=None, separator=None, lfilter=None, template=None,
                 lcfunc=None, syslog=None, wsize=None, csize=None, wweight=None, weightf=None, wfreq=None, wfilter=None,
                 wsearch=None, wrplace=None, wcfunc=None, outliers=None, readdump=None,
                 writedump=None, readwords=None, writewords=None):
        """
        Arguments
        ---------
            indir = directory containing the log file given to parse()
            log_format = log line layout, e.g. '<Date> <Time> <Content>';
                         it must define a <Content> field
            outdir = directory the structured CSV is written to
            rex = list of regular expressions whose matches are removed from
                  the content before clustering (default: no preprocessing)

        The remaining arguments are forwarded to logcluster.pl as command-line
        options whenever they are truthy:

            support = < support >
            rsupport = < relative_support >
            separator = < word_separator_regexp >
            lfilter = < line_filter_regexp >
            template = < line_conversion_template >
            lcfunc = < perl_code >
            syslog = < syslog_facility >
            wsize = < word_sketch_size >
            csize = < candidate_sketch_size >
            wweight = < word_weight_threshold >
            weightf = < word_weight_function >
            wfreq = < word_frequency_threshold >
            wfilter = < word_filter_regexp >
            wsearch = < word_search_regexp >
            wrplace = < word_replace_string > (passed on as -wreplace)
            wcfunc = < perl_code >
            outliers = < outlier_file >
            readdump = < dump_file >
            writedump = < dump_file >
            readwords = < word_file >
            writewords = < word_file >

        NOTE: option values are interpolated verbatim into a shell command
        line, so values containing spaces or shell metacharacters must be
        quoted by the caller and must not come from untrusted input.
        """
        self.path = indir
        self.log_format = log_format
        self.savepath = outdir
        # A fresh list per instance: a mutable default would be shared by all parsers.
        self.rex = [] if rex is None else rex
        self.paras = [support, rsupport, separator, lfilter, template,
                      lcfunc, syslog, wsize, csize, wweight, weightf, wfreq,
                      wfilter, wsearch, wrplace, wcfunc, outliers, readdump, writedump,
                      readwords, writewords]
        # Option names in the same order as self.paras. The keyword argument
        # is spelled "wrplace" for backward compatibility, but the option
        # documented above is "wreplace".
        self.paranames = ["support", "rsupport", "separator", "lfilter", "template", "lcfunc", "syslog",
                          "wsize", "csize", "wweight", "weightf", "wfreq", "wfilter", "wsearch", "wreplace",
                          "wcfunc", "outliers", "readdump", "writedump", "readwords", "writewords"]

        script = os.path.join(os.path.dirname(__file__), 'logcluster.pl')
        command = ["perl {} --input {}".format(script, self._INPUT_FILE)]
        for name, value in zip(self.paranames, self.paras):
            if value:
                command.append("-{} {}".format(name, value))
        command.append("> " + self._OUTPUT_FILE)
        self.perl_command = " ".join(command)

    def parse(self, filename):
        """Parse ``filename`` (relative to ``indir``) and write the structured CSV.

        Raises whatever the LogCluster subprocess raises (for example
        ``subprocess.CalledProcessError`` when perl exits with an error).
        The scratch files are removed whether or not the run succeeds.
        """
        start_time = datetime.now()
        filepath = os.path.join(self.path, filename)
        print('Parsing file: ' + filepath)
        self.filename = filename
        headers, regex = self.generate_logformat_regex(self.log_format)
        self.df_log = self.log_to_dataframe(filepath, regex, headers, self.log_format)
        try:
            with open(self._INPUT_FILE, 'w') as fw:
                for line in self.df_log['Content']:
                    for current_rex in self.rex or ():
                        line = re.sub(current_rex, '', line)
                    fw.write(line + '\n')
            try:
                print("Run LogCluster command...\n>> {}".format(self.perl_command))
                subprocess.check_call(self.perl_command, shell=True)
            except (subprocess.CalledProcessError, OSError):
                print("LogCluster run failed! Please check perl installed.\n")
                raise
            self.wirteResultToFile()
        finally:
            # Never leave scratch files behind, even when the run failed.
            for scratch in (self._INPUT_FILE, self._OUTPUT_FILE):
                if os.path.exists(scratch):
                    os.remove(scratch)
        print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - start_time))

    def wirteResultToFile(self):
        """Label ``self.df_log`` with the clusters found and save it as CSV.

        Each non-blank line of the LogCluster output file is read as
        tab-separated fields: the template first, then a comma-separated list
        of 1-based line numbers. Log lines that belong to no cluster keep
        their own content as template. The result is written to
        ``<savepath>/<filename>_structured.csv``.
        """
        if not os.path.isdir(self.savepath):
            os.makedirs(self.savepath)

        templates = []
        template_ids = []
        line_to_cluster = {}
        with open(self._OUTPUT_FILE, 'r') as fr:
            for raw in fr:
                if not raw.strip():
                    # Blank lines carry no cluster; indexing them would fail.
                    continue
                fields = raw.split('\t')
                cluster_idx = len(templates)
                templates.append(fields[0].strip())
                # The id is derived from the raw first field, as before, so
                # previously generated EventIds stay stable.
                template_ids.append(hashlib.md5(fields[0].encode('utf-8')).hexdigest()[0:8])
                for num in fields[1].split(','):
                    num = num.strip()
                    if num:
                        line_to_cluster[int(num)] = cluster_idx

        event_ids = []
        event_templates = []
        # Line numbers reported by LogCluster are 1-based positions in the
        # input file, which holds exactly one line per dataframe row.
        for line_id, content in enumerate(self.df_log["Content"], 1):
            cluster_idx = line_to_cluster.get(line_id)
            if cluster_idx is None:
                event_templates.append(content)
                event_ids.append(hashlib.md5(content.encode('utf-8')).hexdigest()[0:8])
            else:
                event_templates.append(templates[cluster_idx])
                event_ids.append(template_ids[cluster_idx])

        self.df_log["EventId"] = event_ids
        self.df_log["EventTemplate"] = event_templates
        self.df_log.to_csv(os.path.join(self.savepath, self.filename + '_structured.csv'), index=False)

    def log_to_dataframe(self, log_file, regex, headers, logformat):
        """ Function to transform log file to dataframe

        Lines that do not match ``regex`` are skipped. The returned dataframe
        has a leading 1-based ``LineId`` column followed by one column per
        header. ``logformat`` is accepted for interface compatibility and is
        not used.
        """
        log_messages = []
        with open(log_file, 'r') as fin:
            for line in fin:
                match = regex.search(line.strip())
                if match is None:
                    continue
                log_messages.append([match.group(header) for header in headers])
        logdf = pd.DataFrame(log_messages, columns=headers)
        logdf.insert(0, 'LineId', list(range(1, len(log_messages) + 1)))
        return logdf

    def generate_logformat_regex(self, logformat):
        """ Function to generate regular expression to split log messages

        ``<Name>`` placeholders become lazy named groups and runs of spaces in
        the literal text between them match any run of whitespace. The literal
        text is otherwise used as a regular expression unchanged, so special
        characters in it must already be escaped by the caller.

        Returns the list of header names and the compiled, fully anchored
        pattern.
        """
        headers = []
        pieces = []
        for k, part in enumerate(re.split(r'(<[^<>]+>)', logformat)):
            if k % 2 == 0:
                # The replacement is itself escape-processed by re.sub, so the
                # backslash has to be doubled to produce a literal "\s+".
                pieces.append(re.sub(' +', r'\\s+', part))
            else:
                header = part.strip('<').strip('>')
                pieces.append('(?P<%s>.*?)' % header)
                headers.append(header)
        regex = re.compile('^' + ''.join(pieces) + '$')
        return headers, regex
 No newline at end of file

README.md

0 → 100644
+9 −0
原始行号 差异行号 差异行
# LogCluster

[LogCluster](http://ristov.github.io/logcluster/) is a Perl-based tool for log file clustering and mining line patterns from log files. The development of LogCluster was inspired by [SLCT](http://ristov.github.io/slct/), but LogCluster includes a number of novel features and data processing options. 

To provide a common interface for log parsing, we wrote a Python wrapper around the original [LogCluster source code in Perl](https://github.com/ristov/logcluster) (released under the GPL license). This also eases our benchmarking experiments. The implementation has been tested on both Linux and Windows systems; on Windows, [Strawberry Perl](http://strawberryperl.com/) was installed to run the Perl program.

Read more information about LogCluster from the following paper:

+ Risto Vaarandi, Mauno Pihelgas. [LogCluster - A Data Clustering and Pattern Mining Algorithm for Event Logs](http://ristov.github.io/publications/cnsm15-logcluster-web.pdf), *Proceedings of the 11th International Conference on Network and Service Management (CNSM)*, 2015.

__init__.py

0 → 100644
+2 −0
原始行号 差异行号 差异行
from LogCluster import *
 No newline at end of file

logcluster.pl

0 → 100644
+0 −0

添加文件。

预览已超出大小限制,变更已折叠。