.gitignore 0 → 100644 +129 −0

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
BGL&HDFS dataset and Methods of data processing/BGL/01_handle BGL dataset.py 0 → 100644 +30 −0

import pandas as pd
import copy
from collections import Counter

pre_data = pd.read_csv('./bgl/BGL_sequence.csv')
pre_data = pre_data.values
data = []
label = []
for i in range(0, len(pre_data)):
    value = []
    division = pre_data[i][0].split(",")
    if division[0] != '[]':
        for j in range(0, len(division)):
            if '[' in division[j] and ']' not in division[j]:
                value.append(int(division[j][3:-1]))
            elif '[' in division[j] and ']' in division[j]:
                value.append(int(division[j][3:-2]))
            elif '[' not in division[j] and ']' in division[j]:
                value.append(int(division[j][3:-2]))
            else:
                value.append(int(division[j][3:-1]))
    else:
        value.append(0)
    data.append(value)
    label.append(int(pre_data[i][1]))

print(Counter(label))
pd.DataFrame(data=data).to_csv('../data/bgl_data.csv', index=False, header=False)
pd.DataFrame(data=label).to_csv('../data/bgl_label.csv', index=False, header=False)
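The slicing offsets in the loop above imply that each Sequence cell holds a stringified Python list of event ids such as "['E5', 'E22', 'E7']"; that format is inferred from the code, not confirmed by the repository. The condensed sketch below replays the same branch logic on one such assumed row.

# Minimal sketch of the parsing step above, under the ASSUMPTION that each row
# of BGL_sequence.csv stores a stringified list of event ids such as
# "['E5', 'E22', 'E7']" (inferred from the [3:-1]/[3:-2] slicing, not confirmed).
row = "['E5', 'E22', 'E7']"

value = []
division = row.split(",")
if division[0] != '[]':
    for part in division:
        if '[' in part and ']' not in part:
            value.append(int(part[3:-1]))   # "['E5'"   -> 5
        elif ']' in part:
            value.append(int(part[3:-2]))   # " 'E7']"  -> 7
        else:
            value.append(int(part[3:-1]))   # " 'E22'"  -> 22
else:
    value = [0]                             # empty window

print(value)  # expected: [5, 22, 7]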
BGL&HDFS dataset and Methods of data processing/BGL/02_handle BGL templates.py 0 → 100644 +123 −0

import numpy as np
from keras.layers import Input, Embedding, Lambda
from keras.models import Model, load_model
from sklearn.feature_extraction.text import TfidfVectorizer
import keras.backend as K
import pandas as pd
import json

word_size = 300  # word-vector dimension
window = 5  # context window size
nb_negative = 15  # number of random negative samples
min_count = 0  # words occurring fewer than min_count times are discarded
nb_worker = 1  # number of parallel workers for reading data
nb_epoch = 20  # number of epochs; with adam, 1-2 epochs already give decent results
subsample_t = 1e-5  # words with frequency above subsample_t are subsampled, which improves both speed and vector quality
nb_sentence_per_batch = 30  # sentences per batch; batching by sentence makes the steps parameter easy to estimate (note the sample count is proportional to the word count)

def getdata():
    data = pd.read_csv('./bgl/templates.csv').values
    templates = data[:, 1]
    label = data[:, 0]
    sentences = []
    for s in templates:
        sentences.append(s.split())
    return label, templates, sentences

def bulid_dic(sentences):  # build the vocabulary dictionaries
    words = {}  # word-frequency table
    nb_sentence = 0  # total number of sentences
    total = 0.  # total word count
    for d in sentences:
        nb_sentence += 1
        for w in d:
            if w not in words:
                words[w] = 0
            words[w] += 1
            total += 1
        if nb_sentence % 100 == 0:
            pass
    words = {i: j for i, j in words.items() if j >= min_count}  # drop words below min_count
    id2word = {i + 1: j for i, j in enumerate(words)}  # id-to-word mapping; 0 is reserved for UNK
    word2id = {j: i for i, j in id2word.items()}  # word-to-id mapping
    nb_word = len(words) + 1  # vocabulary size (including the padding symbol 0)
    subsamples = {i: j / total for i, j in words.items() if j / total > subsample_t}
    subsamples = {i: subsample_t / j + (subsample_t / j) ** 0.5 for i, j in subsamples.items()}  # this subsampling formula follows the original word2vec source code
    subsamples = {word2id[i]: j for i, j in subsamples.items() if j < 1.}  # subsampling table
    return nb_sentence, id2word, word2id, nb_word, subsamples

def data_generator(word2id, subsamples, data):  # training-data generator
    x, y = [], []
    _ = 0
    for d in data:
        d = [0] * window + [word2id[w] for w in d if w in word2id] + [0] * window
        r = np.random.random(len(d))
        for i in range(window, len(d) - window):
            if d[i] in subsamples and r[i] > subsamples[d[i]]:  # skip words that satisfy the subsampling condition
                continue
            x.append(d[i - window:i] + d[i + 1:i + 1 + window])
            y.append([d[i]])
        _ += 1
        if _ == nb_sentence_per_batch:
            x, y = np.array(x), np.array(y)
            z = np.zeros((len(x), 1))
            return [x, y], z

def build_w2vm(word_size, window, nb_word, nb_negative):
    K.clear_session()  # clear any previous model to avoid exhausting memory
    # CBOW input
    input_words = Input(shape=(window * 2,), dtype='int32')
    input_vecs = Embedding(nb_word, word_size, name='word2vec')(input_words)
    input_vecs_sum = Lambda(lambda x: K.sum(x, axis=1))(input_vecs)  # CBOW: simply sum the context word vectors
    # build random negative samples and combine them with the target into one candidate set
    target_word = Input(shape=(1,), dtype='int32')
    negatives = Lambda(lambda x: K.random_uniform((K.shape(x)[0], nb_negative), 0, nb_word, 'int32'))(target_word)
    samples = Lambda(lambda x: K.concatenate(x))([target_word, negatives])  # negatives are drawn uniformly; they may occasionally hit the positive word, but with low probability
    # apply the dense projection and softmax only over the sampled candidates
    softmax_weights = Embedding(nb_word, word_size, name='W')(samples)
    softmax_biases = Embedding(nb_word, 1, name='b')(samples)
    softmax = Lambda(lambda x: K.softmax((K.batch_dot(x[0], K.expand_dims(x[1], 2)) + x[2])[:, :, 0])
                     )([softmax_weights, input_vecs_sum, softmax_biases])
    # parameters live in Embedding layers and the matrix multiplication uses backend ops, reproducing a Dense layer
    # note that the target is placed first in the candidate set, so the softmax target id is always 0, as the z variable in data_generator shows
    model = Model(inputs=[input_words, target_word], outputs=softmax)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # note that the loss is sparse_categorical_crossentropy, not categorical_crossentropy
    model.summary()
    return model

if __name__ == '__main__':
    # word2vector
    label, templates, sentences = getdata()
    nb_sentence, id2word, word2id, nb_word, subsamples = bulid_dic(sentences)
    ipt, opt = data_generator(word2id, subsamples, templates)  # build the training data
    model = build_w2vm(word_size, window, nb_word, nb_negative)  # build the model
    model.fit(ipt, opt, steps_per_epoch=int(nb_sentence / nb_sentence_per_batch), epochs=nb_epoch)
    model.save('word2vec.h5')
    embeddings = model.get_weights()[0]
    normalized_embeddings = embeddings / (embeddings**2).sum(axis=1).reshape((-1, 1))**0.5  # normalize the word vectors to unit length
    # save the sentence vectors
    vector_json = {}
    for i in range(0, len(sentences)):
        vector = []
        for ii in sentences[i]:
            vector.append(normalized_embeddings[word2id[ii]])
        vector_json.update({(i + 1): list(np.float64(np.sum(vector, axis=0)))})
    json_str = json.dumps(vector_json)
    with open('./bgl/bgl_templates.json', 'w') as json_file:
        json_file.write(json_str)
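The script above finishes by writing one summed, normalized sentence vector per template to ./bgl/bgl_templates.json, keyed by 1-based template index. As a rough illustration (not part of the repository), the sketch below shows how a downstream step might load that file back into a NumPy lookup table; the file path and the 300-dimensional vector size follow the settings above.

# Minimal sketch (an assumption, not part of the original repository) of how the
# sentence vectors written to ./bgl/bgl_templates.json could be loaded back into
# a NumPy lookup table keyed by template id.
import json
import numpy as np

with open('./bgl/bgl_templates.json') as f:
    vector_json = json.load(f)  # {"1": [...300 floats...], "2": [...], ...}

# JSON keys come back as strings; convert them to integer template ids.
template_vectors = {int(k): np.array(v) for k, v in vector_json.items()}
print(len(template_vectors), template_vectors[1].shape)  # e.g. (300,)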
BGL&HDFS dataset and Methods of data processing/BGL/03_1_constructing sequences by length.py 0 → 100644 +48 −0

import pandas as pd
import numpy as np
from collections import Counter

sequence_length = 300

data = pd.read_csv('./bgl/BGL_100k_structured.csv')
data = data.values
pre_label = data[:, 0]
logs = data[:, 2]

# process the data
for i in range(0, len(logs)):
    logs[i] = int(logs[i][1:])

# process the labels
label = []
for l in range(0, len(pre_label)):
    if pre_label[l] == '-':
        label.append(0)
    else:
        label.append(1)

logs_data = []
for j in range(len(logs) - sequence_length):
    logs_data.append(logs[j: j + sequence_length])
reshaped_logs = np.array(logs_data).astype('float64')

logs_label = []
for k in range(len(label) - sequence_length):
    logs_label.append(label[k: k + sequence_length])
reshaped_label = np.array(logs_label).astype('float64')
# reshaped_label = logs_label

result_label = []
for m in range(0, len(reshaped_label)):
    if 1 in reshaped_label[m]:
        result_label.append(1)
    else:
        result_label.append(0)

end_logs = []
end_label = []
for n in range(0, len(result_label)):
    # if n%10 == 0:
    end_logs.append(reshaped_logs[n])
    end_label.append(result_label[n])

print(Counter(end_label))
pd.DataFrame(data=end_logs).to_csv('../data/bgl_data.csv', index=False, header=False)
pd.DataFrame(data=end_label).to_csv('../data/bgl_label.csv', index=False, header=False)
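The windowing above slides a fixed-length window over the event stream and marks a window anomalous if any line inside it is anomalous. The toy sketch below (not from the repository) replays that logic on a few made-up event ids with sequence_length shrunk to 3 so the result is easy to check by hand.

# Toy illustration of the sliding-window labelling used above: every window of
# sequence_length consecutive events becomes one sample, labelled 1 if it
# contains at least one anomalous line.
import numpy as np

sequence_length = 3                  # the script above uses 300
logs = [11, 12, 13, 14, 15, 16]      # event template ids
label = [0, 0, 1, 0, 0, 0]           # 1 = anomalous log line

windows, window_labels = [], []
for j in range(len(logs) - sequence_length):
    windows.append(logs[j: j + sequence_length])
    window_labels.append(1 if 1 in label[j: j + sequence_length] else 0)

print(np.array(windows))
print(window_labels)                 # [1, 1, 1]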
BGL&HDFS dataset and Methods of data processing/BGL/03_2_Transformation to a supervised data set under time windows.py 0 → 100644 +37 −0

import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf)
import csv
import copy
from collections import Counter

pre_data = pd.read_csv('./bgl/BGL_sequence.csv')
pre_data = pre_data.values
data = []
label = []
count = 0
co = 0
for i in range(0, len(pre_data)):
    value = []
    division = pre_data[i][0].split(",")
    if division[0] != '[]':
        for j in range(0, len(division)):
            if '[' in division[j] and ']' not in division[j]:
                value.append(int(division[j][3:-1]))
            elif '[' in division[j] and ']' in division[j]:
                value.append(int(division[j][3:-2]))
            elif '[' not in division[j] and ']' in division[j]:
                value.append(int(division[j][3:-2]))
            else:
                value.append(int(division[j][3:-1]))
        line = str(np.array(value))[2:-1].split(' ')
        data.append(str(np.array(value))[1:-1])
        label.append(int(pre_data[i][1]))
        co = co + 1
    else:
        count = count + 1

print(co)
print(count)
pd.DataFrame({'Sequence': data, 'label': label}).to_csv('./data/bgl_time_data.csv', index=False, header=False)
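This script stores each non-empty window as a space-separated string of event ids next to its window label, written without a header row. A minimal read-back sketch follows, assuming the ./data/bgl_time_data.csv produced above; the column names Sequence and label are only assigned at load time, since the file itself carries no header.

# Minimal sketch (an assumption, not part of the repository) of how the
# bgl_time_data.csv written above could be read back: each row holds a
# space-separated event-id sequence and its label, with no header.
import pandas as pd

df = pd.read_csv('./data/bgl_time_data.csv', header=None, names=['Sequence', 'label'])
sequences = [[int(t) for t in s.split()] for s in df['Sequence']]
labels = df['label'].tolist()
print(sequences[0], labels[0])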