Initial commit (15b7ed20) · 提交 · AIOps-NanKai / model / TraceRCA

.gitignore

0 → 100644

+1 −0

原始行号	差异行号	差异行
		.DS_Store

Makefile

0 → 100644

+213 −0

原始行号	差异行号	差异行
		# Run all experiments
		#
		# ---- Paths ---------------------------------------------------------------
		ROOT := /TraceAnalysis
		OUTPUT := $(ROOT)/output
		# OUTPUT := $(ROOT)/tracerca-exp/data/dockeroutput
		ORIGIN_DATA_DIR := $(ROOT)/tracerca-exp/data/
		ROOT_CAUSE_DIR := $(ORIGIN_DATA_DIR)/root_causes/
		SCRIPT_DIR := $(ROOT)/tracerca-exp
		CONFIG_SCRIPTS := $(ROOT)/tracerca-exp/trainticket_config.py
		# Extra argument(s) appended to the trace-level anomaly detection command; empty by default.
		UPDATE_CACHE_FLAG :=


		# ---- Input data (must be prepared manually) --------------------------------
		# Expanded once with := so the directory scan is not repeated on every reference.
		# $(wildcard) yields an empty list (instead of an `ls` error) when nothing matches;
		# $(sort) keeps the order stable across GNU Make versions.
		TEST_FILES := $(sort $(wildcard $(ORIGIN_DATA_DIR)/test/*.pkl))
		NORMAL_TRAIN_FILES := $(sort $(wildcard $(ORIGIN_DATA_DIR)/normal/*.pkl))
		ABNORMAL_TRAIN_FILES := $(sort $(wildcard $(ORIGIN_DATA_DIR)/train/*.pkl))
		ALL_TRAIN_FILES := $(NORMAL_TRAIN_FILES) $(ABNORMAL_TRAIN_FILES)

		# ---- Hyperparameters (override on the command line, e.g. `make SIGMA=2`) ----
		# They are embedded in output file names, so each setting gets its own result files.
		DROP_SERVICE = 0
		DROP_FAULT_TYPE = 0
		SUPPORT = 0.1
		SIGMA = 1
		FISHER = 3
		K = 100


		# ---- Derived file names -----------------------------------------------------
		# Test file names without directory and without the .pkl extension.
		test_names := $(basename $(notdir $(TEST_FILES)))
		ad_test_dir := $(OUTPUT)/trainticket_anomaly_detection.test
		rcl_dir := $(OUTPUT)/trainticket_root_cause_localization

		INVO_TEST_FILE_RESULTS := $(addprefix $(ad_test_dir)/,$(addsuffix .invo.result.pkl.$(SIGMA).$(FISHER),$(test_names)))
		TRACE_TEST_FILE_RESULTS := $(addprefix $(ad_test_dir)/,$(addsuffix .trace.result.pkl.$(DROP_SERVICE).$(DROP_FAULT_TYPE),$(test_names)))

		ANOMALY_DETECTION_RESULT := $(OUTPUT)/trainticket.anomaly_detection.result.csv.$(DROP_SERVICE).$(DROP_FAULT_TYPE).$(SIGMA).$(FISHER)
		FAULT_LOCALIZATION_RESULT := $(OUTPUT)/trainticket.root_cause_localization.result.csv.$(DROP_SERVICE).$(DROP_FAULT_TYPE).$(SUPPORT).$(K)

		EFFECT_OF_TRACE_LOCALIZATION_RESULT := $(OUTPUT)/trainticket.root_cause_localization.effect_of_trace.result.csv.$(DROP_SERVICE).$(DROP_FAULT_TYPE)

		ANOMALY_DETECTION_MODEL := $(OUTPUT)/trainticket_anomaly_detection.models.$(DROP_SERVICE).$(DROP_FAULT_TYPE)
		TRACE_HISTORICAL_DATA := $(OUTPUT)/trainticket_trace_encoded/trainticket_historical_all.trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz
		INVO_HISTORICAL_DATA := $(OUTPUT)/trainticket_invo_encoded/trainticket_historical_normal.invo.pkl

		ASSOCIATION_RULE_MINING_TEST_FILE_RESULTS := $(addprefix $(rcl_dir)/,$(addsuffix .association_rule_mining.result.pkl.$(SUPPORT).$(K),$(test_names)))
		PAGERANK_TEST_FILE_RESULTS := $(addprefix $(rcl_dir)/,$(addsuffix .pagerank.result.pkl,$(test_names)))
		MEPFL_TEST_FILE_RESULTS := $(addprefix $(rcl_dir)/,$(addsuffix .MEPFL.result.pkl.$(DROP_SERVICE).$(DROP_FAULT_TYPE),$(test_names)))
		RCSF_TEST_FILE_RESULTS := $(addprefix $(rcl_dir)/,$(addsuffix .RCSF.result.pkl,$(test_names)))
		MICROSCOPE_TEST_FILE_RESULTS := $(addprefix $(rcl_dir)/,$(addsuffix .microscope.result.pkl,$(test_names)))
		LOCALIZATION_MODEL := $(OUTPUT)/trainticket_localization.models.$(DROP_SERVICE).$(DROP_FAULT_TYPE)

		# The suffix previously referenced the misspelled (hence empty) variable SUPPROT.
		ASSOCIATION_RULE_MINING_TEST_FILE_RESULTS_EFFECT_OF_TRACE := $(addprefix $(rcl_dir)/,$(addsuffix _effect_of_trace_type1.association_rule_mining.result.pkl.$(SUPPORT),$(test_names)))

		# Keep every intermediate file produced along a rule chain (nothing is auto-deleted).
		.SECONDARY:

		# Convenience goals. Each has an empty recipe and only names the files it stands for.
		.PHONY: target effect-of-trace models localization ad

		# First goal in the file: both final result tables.
		target: $(ANOMALY_DETECTION_RESULT) $(FAULT_LOCALIZATION_RESULT) ;

		effect-of-trace: $(EFFECT_OF_TRACE_LOCALIZATION_RESULT) ;

		models: $(LOCALIZATION_MODEL) $(ANOMALY_DETECTION_MODEL) ;

		localization: $(FAULT_LOCALIZATION_RESULT) ;

		ad: $(ANOMALY_DETECTION_RESULT) ;


		# Summarise the dataset: every encoded test file (trace-level .npz and invocation-level
		# .pkl) plus the two historical files are passed to run_dataset_summary.py via repeated -i.
		.PHONY: dataset-summary
		dataset-summary: \
		    $(addprefix $(OUTPUT)/trainticket_trace_encoded/,$(addsuffix .trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz,$(basename $(notdir $(TEST_FILES))))) \
		    $(addprefix $(OUTPUT)/trainticket_invo_encoded/,$(addsuffix .invo.pkl,$(basename $(notdir $(TEST_FILES))))) \
		    $(INVO_HISTORICAL_DATA) \
		    $(TRACE_HISTORICAL_DATA) \
		    $(SCRIPT_DIR)/run_dataset_summary.py
			# $(INVO_HISTORICAL_DATA) needs parentheses: a bare $INVO_... is read by make as $(I)NVO_...
			echo $(filter %.invo.pkl,$(INVO_HISTORICAL_DATA))
			python run_dataset_summary.py \
				$(addprefix -i ",$(addsuffix ",$(filter %.trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz,$^))) \
				$(addprefix -i ",$(addsuffix ",$(filter %.invo.pkl,$^)))


		# Populate $(ORIGIN_DATA_DIR)/all/ with symlinks to every file under test/, so the
		# encoding pattern rules (which read from all/%.pkl) can find the test inputs.
		# Links already present in all/ are not pruned here.
		.PHONY: prepare-all-files
		prepare-all-files:
			# -p succeeds when the directory already exists but still reports real failures
			mkdir -p $(ORIGIN_DATA_DIR)/all/
			ln -sf $(ORIGIN_DATA_DIR)/test/* $(ORIGIN_DATA_DIR)/all/


		# Print the resolved input lists and the main derived file names.
		.PHONY: debug
		debug:
			echo TEST_FILES: $(TEST_FILES)
			echo NORMAL_TRAIN_FILES: $(NORMAL_TRAIN_FILES)
			echo ABNORMAL_TRAIN_FILES: $(ABNORMAL_TRAIN_FILES)
			echo ALL_TRAIN_FILES: $(ALL_TRAIN_FILES)
			echo INVO_TEST_FILE_RESULTS: $(INVO_TEST_FILE_RESULTS)
			echo TRACE_TEST_FILE_RESULTS: $(TRACE_TEST_FILE_RESULTS)
			echo INVO_HISTORICAL_DATA: $(INVO_HISTORICAL_DATA)
			echo TRACE_HISTORICAL_DATA: $(TRACE_HISTORICAL_DATA)
			echo ANOMALY_DETECTION_RESULT $(ANOMALY_DETECTION_RESULT)


		# ---- Anomaly detection ------------------------------------------------------
		# Recipes invoke the scripts by bare file name, so make has to be run from the
		# directory that contains them.

		# Final table: gathers every per-file invocation-level (-i) and trace-level (-t) result.
		$(ANOMALY_DETECTION_RESULT): \
		    $(INVO_TEST_FILE_RESULTS) \
		    $(TRACE_TEST_FILE_RESULTS) \
		    $(SCRIPT_DIR)/run_anomaly_detection_collect_result.py \
		    $(CONFIG_SCRIPTS)
			python run_anomaly_detection_collect_result.py \
				$(foreach f,$(INVO_TEST_FILE_RESULTS),-i "$(f)") \
				$(foreach f,$(TRACE_TEST_FILE_RESULTS),-t "$(f)") \
				-o "$@"

		# Model prepared from the historical data: -t gets the trace-level file (1st prerequisite),
		# -i the invocation-level file (2nd prerequisite).
		$(ANOMALY_DETECTION_MODEL): \
		    $(TRACE_HISTORICAL_DATA) \
		    $(INVO_HISTORICAL_DATA) \
		    $(SCRIPT_DIR)/run_anomaly_detection_prepare_model.py
			python run_anomaly_detection_prepare_model.py -i $(word 2,$^) -t $< -o $@


		# Invocation-level detection for one test file.
		# -h: historical invocation data, -u: selected features, --threshold: SIGMA.
		$(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl.$(SIGMA).$(FISHER): \
		    $(OUTPUT)/trainticket_invo_encoded/%.invo.pkl \
		    $(INVO_HISTORICAL_DATA) \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.useful_features.$(FISHER) \
		    $(ANOMALY_DETECTION_MODEL) \
		    $(SCRIPT_DIR)/run_anomaly_detection_invo.py
			python run_anomaly_detection_invo.py -i $< -o $@ -h $(word 2,$^) -u $(word 3,$^) --cache $(ANOMALY_DETECTION_MODEL) --threshold $(SIGMA)

		# Feature selection for one test file against the historical invocation data.
		$(OUTPUT)/trainticket_anomaly_detection.test/%.useful_features.$(FISHER): \
		    $(OUTPUT)/trainticket_invo_encoded/%.invo.pkl \
		    $(INVO_HISTORICAL_DATA) \
		    $(SCRIPT_DIR)/run_selecting_features.py
			python run_selecting_features.py -i $< -o $@ -h $(word 2,$^) --fisher $(FISHER)

		# Trace-level detection for one test file; -m receives the anomaly detection model.
		$(OUTPUT)/trainticket_anomaly_detection.test/%.trace.result.pkl.$(DROP_SERVICE).$(DROP_FAULT_TYPE): \
		    $(OUTPUT)/trainticket_trace_encoded/%.trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz \
		    $(TRACE_HISTORICAL_DATA) \
		    $(ANOMALY_DETECTION_MODEL) \
		    $(SCRIPT_DIR)/run_anomaly_detection_trace.py
			python run_anomaly_detection_trace.py -i $< -o $@ -h $(word 2,$^) \
				-m $(word 3,$^) $(UPDATE_CACHE_FLAG)



		# ---- Encoding ---------------------------------------------------------------
		# Both encoders read the raw pickle from $(ORIGIN_DATA_DIR)/all/.
		$(OUTPUT)/trainticket_invo_encoded/%.invo.pkl: \
		    $(ORIGIN_DATA_DIR)/all/%.pkl \
		    $(SCRIPT_DIR)/run_invo_encoding.py
			python run_invo_encoding.py -i $< -o $@

		$(OUTPUT)/trainticket_trace_encoded/%.trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz: \
		    $(ORIGIN_DATA_DIR)/all/%.pkl \
		    $(SCRIPT_DIR)/run_trace_encoding.py
			python run_trace_encoding.py -i $< -o $@ --drop-fault-type $(DROP_FAULT_TYPE) --drop-service $(DROP_SERVICE)

		# ---- Historical (training) data ---------------------------------------------
		# Normal-only training files concatenated into one pickle.
		$(ORIGIN_DATA_DIR)/all/trainticket_historical_normal.pkl: \
		    $(NORMAL_TRAIN_FILES) \
		    $(SCRIPT_DIR)/run_concatenate.py
			python run_concatenate.py $(foreach f,$(NORMAL_TRAIN_FILES),-i "$(f)") -o $@

		# Normal plus abnormal training files, concatenated with --add-root-cause.
		$(ORIGIN_DATA_DIR)/all/trainticket_historical_all.pkl: \
		    $(ALL_TRAIN_FILES) \
		    $(SCRIPT_DIR)/run_concatenate.py
			python run_concatenate.py $(foreach f,$(ALL_TRAIN_FILES),-i "$(f)") -o $@ --add-root-cause

		# ---- ROOT CAUSE LOCALIZATION --------------------------------------------------

		# Final table: gathers the per-file results of all five localization methods
		# (every file is passed with -i) and the root-cause directory (-r).
		$(FAULT_LOCALIZATION_RESULT): \
		    $(ASSOCIATION_RULE_MINING_TEST_FILE_RESULTS) \
		    $(PAGERANK_TEST_FILE_RESULTS) \
		    $(MEPFL_TEST_FILE_RESULTS) \
		    $(MICROSCOPE_TEST_FILE_RESULTS) \
		    $(RCSF_TEST_FILE_RESULTS) \
		    $(SCRIPT_DIR)/run_localization_collect.py
			python run_localization_collect.py \
				$(foreach f,$(ASSOCIATION_RULE_MINING_TEST_FILE_RESULTS),-i "$(f)") \
				$(foreach f,$(MEPFL_TEST_FILE_RESULTS),-i "$(f)") \
				$(foreach f,$(PAGERANK_TEST_FILE_RESULTS),-i "$(f)") \
				$(foreach f,$(RCSF_TEST_FILE_RESULTS),-i "$(f)") \
				$(foreach f,$(MICROSCOPE_TEST_FILE_RESULTS),-i "$(f)") \
				-r $(ROOT_CAUSE_DIR) \
				-o $@

		# Effect-of-trace table. Only the collecting script is declared as a prerequisite.
		$(EFFECT_OF_TRACE_LOCALIZATION_RESULT): $(SCRIPT_DIR)/run_effect_of_trace_localization_collect.py
			python run_effect_of_trace_localization_collect.py -o $@

		# NOTE(review): this rule reads %.invo.result.pkl without the .$(SIGMA).$(FISHER) suffix
		# that the invocation-level detection rule produces - looks stale; confirm before relying on it.
		$(OUTPUT)/trainticket_anomaly_detection.test/%_effect_of_trace_type1.invo.result.pkl: \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl \
		    $(SCRIPT_DIR)/run_effect_of_trace_inject.py
			python run_effect_of_trace_inject.py -i $< -o $@ -r $(OUTPUT)/trainticket_effect_of_trace.root_cause/


		# Localization model prepared from the historical trace data.
		$(LOCALIZATION_MODEL): \
		    $(TRACE_HISTORICAL_DATA) \
		    $(SCRIPT_DIR)/run_localization_prepare_model.py
			python run_localization_prepare_model.py -t $< -o $@

		# Association rule mining on the invocation-level detection result.
		# Every .py file of the association_rule_mining package is a prerequisite as well.
		$(OUTPUT)/trainticket_root_cause_localization/%.association_rule_mining.result.pkl.$(SUPPORT).$(K): \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl.$(SIGMA).$(FISHER) \
		    $(SCRIPT_DIR)/run_localization_association_rule_mining.py \
		    $(shell ls $(SCRIPT_DIR)/association_rule_mining/*.py) \
		    $(CONFIG_SCRIPTS)
			python run_localization_association_rule_mining.py -i $< -o $@ \
				--min-support-rate $(SUPPORT) --quiet --k $(K)

		# PageRank on the invocation-level detection result.
		$(OUTPUT)/trainticket_root_cause_localization/%.pagerank.result.pkl: \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl.$(SIGMA).$(FISHER) \
		    $(SCRIPT_DIR)/run_localization_pagerank.py
			python run_localization_pagerank.py -i $< -o $@

		# RCSF on the invocation-level detection result.
		$(OUTPUT)/trainticket_root_cause_localization/%.RCSF.result.pkl: \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl.$(SIGMA).$(FISHER) \
		    $(SCRIPT_DIR)/run_localization_RCSF.py
			python run_localization_RCSF.py -i $< -o $@

		# MEPFL on the encoded traces; -c receives the localization model.
		$(OUTPUT)/trainticket_root_cause_localization/%.MEPFL.result.pkl.$(DROP_SERVICE).$(DROP_FAULT_TYPE): \
		    $(OUTPUT)/trainticket_trace_encoded/%.trace.$(DROP_SERVICE).$(DROP_FAULT_TYPE).npz \
		    $(LOCALIZATION_MODEL) \
		    $(SCRIPT_DIR)/run_localization_MEPFL.py
			python run_localization_MEPFL.py -i $< -o $@ -c $(word 2,$^)

		# Microscope on the invocation-level detection result.
		$(OUTPUT)/trainticket_root_cause_localization/%.microscope.result.pkl: \
		    $(OUTPUT)/trainticket_anomaly_detection.test/%.invo.result.pkl.$(SIGMA).$(FISHER) \
		    $(SCRIPT_DIR)/run_localization_microscope.py
			python run_localization_microscope.py -i $< -o $@

		# Remove generated results, encoded data, the files under all/, and the models.
		# `rm -f` is silent about files that do not exist, so no `|| echo OK` guard is needed.
		.PHONY: clean
		clean: clean-cache clean-debug
			rm -f $(ANOMALY_DETECTION_RESULT)
			# the trailing wildcard covers every hyperparameter suffix (e.g. .$(SIGMA).$(FISHER))
			rm -f $(OUTPUT)/trainticket_anomaly_detection.test/*.result.pkl.*
			rm -f $(OUTPUT)/trainticket_anomaly_detection.test/*.useful_features.*
			rm -f $(OUTPUT)/trainticket_root_cause_localization/*.result.pkl.*
			rm -f $(OUTPUT)/trainticket_root_cause_localization/*.result.pkl
			rm -f $(OUTPUT)/trainticket_invo_encoded/*.pkl
			rm -f $(OUTPUT)/trainticket_trace_encoded/*.npz
			# HISTORICAL_DATA was never defined; the two historical files are named explicitly
			rm -f $(INVO_HISTORICAL_DATA) $(TRACE_HISTORICAL_DATA)
			rm -f $(ORIGIN_DATA_DIR)/all/*.pkl
			# -r: works whether the model path is a single file or a directory
			rm -rf $(ANOMALY_DETECTION_MODEL)
			rm -rf $(LOCALIZATION_MODEL)

		# Placeholder goal with an empty recipe.
		.PHONY: clean-cache
		clean-cache: ;


		# Remove the PDF files written to the feature-selection debug directory.
		.PHONY: clean-debug
		clean-debug:
			rm -f $(OUTPUT)/trainticket_anomaly_detection.test/selecting_feature.debug/*.pdf


		# Render all figures from the result tables.
		# The effect-of-trace table is read by one of the plot scripts, so it is listed as a
		# prerequisite as well. $(wildcard) lists the entries of plot/ by path only; `ls` would
		# expand a sub-directory into bare file names that make cannot resolve.
		.PHONY: plot
		plot: \
		    $(ANOMALY_DETECTION_RESULT) \
		    $(FAULT_LOCALIZATION_RESULT) \
		    $(EFFECT_OF_TRACE_LOCALIZATION_RESULT) \
		    $(wildcard $(SCRIPT_DIR)/plot/*)
			python plot/run_plot_anomaly_detection.py \
				-i $(ANOMALY_DETECTION_RESULT) \
				-o $(OUTPUT)/figures/anomaly_detection_comparison.pdf
			python plot/run_plot_localization.py \
				-i $(FAULT_LOCALIZATION_RESULT) \
				-o $(OUTPUT)/figures/
			python plot/run_plot_localization_effect_of_trace.py \
				-i $(EFFECT_OF_TRACE_LOCALIZATION_RESULT) \
				-o $(OUTPUT)/figures/effect_of_trace/
			# this command used to be listed twice with identical arguments; once is enough
			python plot/run_plot_noise_localization.py \
				-i $(FAULT_LOCALIZATION_RESULT) \
				-o $(OUTPUT)/figures/
			# NOTE(review): output path is relative to the working directory rather than
			# $(OUTPUT) - confirm whether run_plot_drop.py expects this layout.
			python plot/run_plot_drop.py --output ../output/figures/drop/

README.md

0 → 100644

+32 −0

原始行号	差异行号	差异行
		# TraceRCA
		Practical Root Cause Localization for Microservice Systems via Trace Analysis. IWQoS 2021


		## Dataset
		The study data is public at
		- OneDrive: https://1drv.ms/u/s!Ao2DxaN2zku_bAUszKmCUiodw94?e=7ThI47
		- Tsinghua Cloud: https://cloud.tsinghua.edu.cn/d/8371855eddd64a8db23b/ (accessible from mainland China / 中国大陆可访问)


		## Implementation Code

		The experiment workflow is controlled via the Makefile. Refer to the Makefile for the inputs and outputs of each step.

		- `run_selecting_features.py`: Feature selection
		- `run_anomaly_detection_invo.py`: Anomaly detection based on the useful features
		- `run_localization_association_rule_mining_20210516.py`: Root-cause service localization
		- `prepare_train_file_tmp.py` is used to split the dataset into train and test datasets. Note that this step is not included in the Makefile.


		[Presentation Video](https://www.bilibili.com/video/BV14b4y1C7rQ/)
		## Cite
		If the dataset is helpful, please cite the paper.
		``` bibtex
		@inproceedings{li2021practical,
		title={Practical Root Cause Localization for Microservice Systems via Trace Analysis},
		author={Li, Zeyan and Chen, Junjie and Jiao, Rui and Zhao, Nengwen and Wang, Zhijun and Zhang, Shuwei and Wu, Yanjun and Jiang, Long and Yan, Leiqin and Wang, Zikai and others},
		booktitle={IEEE/ACM International Symposium on Quality of Service (IWQoS) 2021},
		year={2021},
		publisher = {{IEEE}}
		}
		```

prepare_train_file_tmp.py

0 → 100644

+24 −0

原始行号	差异行号	差异行
		# prepare train file before more injected data is got
		import random
		import pickle
		from pathlib import Path

		import numpy as np
		from tqdm import tqdm


		def main():
		    """Split every ``test/*.pkl`` file into a train part and a test part.

		    For each pickle under ``test/`` (relative to the working directory) the first
		    20% of its items (``int(len * 0.2)``) are written to ``train/<same name>`` and
		    the remaining items are written back over the original test file.

		    WARNING: the test files are rewritten in place, so running this twice
		    splits the already-reduced test files again.
		    """
		    train_dir = Path('train')
		    test_dir = Path('test')
		    # Without this the first dump below fails with FileNotFoundError on a fresh checkout.
		    train_dir.mkdir(parents=True, exist_ok=True)
		    for test_file in tqdm(list(test_dir.glob("*.pkl"))):
		        # NOTE: pickle.load executes arbitrary code; only run this on trusted data files.
		        with open(str(test_file), 'rb') as f:
		            test_data = pickle.load(f)
		        train_length = int(len(test_data) * 0.2)
		        with open(str(train_dir / test_file.name), 'wb') as f:
		            pickle.dump(test_data[:train_length], f)
		        with open(str(test_file), 'wb') as f:
		            pickle.dump(test_data[train_length:], f)


		if __name__ == '__main__':
		    main()

run_anomaly_detection_collect_result.py

0 → 100644

+85 −0

原始行号	差异行号	差异行
		import click
		from pathlib import Path
		import pandas as pd
		import pickle
		from sklearn.metrics import *
		from loguru import logger
		import numpy as np


		def _collect_trace_ids(input_files):
		    """Return the union of the ``trace_id`` values found in all pickled frames."""
		    trace_ids = set()
		    for input_file in input_files:
		        # NOTE: pickle.load executes arbitrary code; only feed it trusted result files.
		        with open(input_file, 'rb') as f:
		            trace_ids |= set(pickle.load(f).trace_id)
		    return trace_ids


		def _metric_rows(y_true, y_pred, method, name):
		    """Build one result row per metric (F1-score, Precision, Recall).

		    Both label arrays are cast to bool first so that ``~`` is a logical NOT;
		    on integer 0/1 labels ``~`` is a bitwise NOT and would corrupt fp/fn/tn.
		    """
		    y_true = np.asarray(y_true).astype(bool)
		    y_pred = np.asarray(y_pred).astype(bool)
		    # The confusion counts do not depend on the metric, so compute them once.
		    counts = {
		        'tp': np.count_nonzero(y_true & y_pred),
		        'fp': np.count_nonzero((~y_true) & y_pred),
		        'fn': np.count_nonzero(y_true & (~y_pred)),
		        'tn': np.count_nonzero((~y_true) & (~y_pred)),
		    }
		    rows = []
		    for metric_function, metric_name in [
		        (f1_score, 'F1-score'), (precision_score, 'Precision'), (recall_score, 'Recall')
		    ]:
		        row = {
		            'metric_value': metric_function(y_true, y_pred),
		            'metric_name': metric_name,
		        }
		        row.update(counts)
		        row['method'] = method
		        row['name'] = name
		        rows.append(row)
		    return rows


		@click.command('collect result main')
		@click.option("-i", "--invo-input", "invo_input_files", multiple=True)
		@click.option("-t", "--trace-input", "trace_input_files", multiple=True)
		@click.option("-o", "--output", "output_file")
		def collect_result_main(invo_input_files, trace_input_files, output_file):
		    """Score trace-level and invocation-level anomaly detection results.

		    Only trace ids present in both the trace-level and the invocation-level
		    inputs are evaluated. For every input file and every prediction column
		    found in it, F1-score, precision and recall plus the confusion counts
		    are appended to one table, which is written to ``output_file`` as CSV.

		    :param invo_input_files: pickled frames with ``trace_id``, ``trace_label``
		        and ``<method>-predict`` columns, one row per invocation.
		    :param trace_input_files: pickled frames with ``trace_id``, ``trace_label``
		        and ``<algo>-predict`` columns.
		    :param output_file: path of the CSV file to write.
		    """
		    trace_input_files = [Path(_) for _ in trace_input_files]
		    invo_input_files = [Path(_) for _ in invo_input_files]

		    trace_ids = _collect_trace_ids(trace_input_files).intersection(
		        _collect_trace_ids(invo_input_files)
		    )

		    results = []
		    for input_file in trace_input_files:
		        with open(input_file, 'rb') as f:
		            df = pickle.load(f).set_index(['trace_id'])
		        # A list, not a set: recent pandas versions reject a set as a .loc indexer.
		        idx = list(set(df.index.values).intersection(trace_ids))
		        y_true = df.loc[idx, 'trace_label'].values
		        for algo in ['RF-Trace', 'KNN-Trace', 'MLP-Trace']:
		            try:
		                y_pred = df.loc[idx, f'{algo}-predict'].values
		            except KeyError:
		                # this file holds no predictions for this algorithm
		                continue
		            results.extend(
		                _metric_rows(y_true, y_pred, algo, input_file.name.split('.')[0])
		            )

		    for input_file in invo_input_files:
		        with open(input_file, 'rb') as f:
		            df = pickle.load(f)
		        groupby = df.groupby(by=['trace_id'])
		        idx = list(set(df.trace_id.values).intersection(trace_ids))
		        # A trace takes the label of its first invocation row.
		        y_true = groupby['trace_label'].first().loc[idx].values
		        for method in ["Ours", "NoSelection", "IF"]:
		            try:
		                # A trace is predicted anomalous if at least one invocation is flagged.
		                # Aggregating only the needed column avoids summing unrelated columns.
		                y_pred = groupby[f'{method}-predict'].sum().loc[idx].values >= 1
		            except KeyError:
		                # this file holds no predictions for this method
		                continue
		            results.extend(
		                _metric_rows(y_true, y_pred, method, input_file.name.split('.')[0])
		            )

		    results = pd.DataFrame.from_records(results)
		    logger.debug(f"results:\n{results}")
		    results.to_csv(output_file, index=False)


		if __name__ == '__main__':
		    collect_result_main()