提交 0d2860ad 编辑于 作者: openaiops's avatar openaiops
浏览文件

Merge branch 'master' into 'main'

Initial commit

See merge request !1
加载中
加载中
加载中
加载中
加载中

Readme.md

0 → 100644
+46 −0
原始行号 差异行号 差异行
<img width="200" alt="截屏2022-09-19 下午9 50 34" src="https://user-images.githubusercontent.com/112700133/191033061-ea4a1671-26c7-4d52-b3ed-3495a2ae0292.png">

![](https://img.shields.io/badge/version-0.1-green.svg) 

****
Artifacts accompanying the paper *Eadro: An End-to-End Troubleshooting Framework for Microservices on Multi-source Data* published at ICSE 2023. 
This tool tries to model the intra- and inter-dependencies between microservices for troubleshooting, enabling end-to-end anomaly detection and root cause localization.

<img width="400" alt="dependency_00" src="https://user-images.githubusercontent.com/112700133/191036446-d4cf8d07-bd4e-4452-a3e2-f7d4e9da0624.png">

## Data
Our data are at https://doi.org/10.5281/zenodo.7615393.

## Dependencies
`pip install -r requirements.txt`

## Run
`cd codes && python main.py --data <dataset>`


## Architecture
![Eadro](https://user-images.githubusercontent.com/49298462/217256928-f0d61857-678b-4456-a024-359326a2c45d.png)

## Folder Structure
```
.
├── README.md
├── codes                                             
│   ├── base.py                                         training and testing
│   ├── main.py                                         launch the framework
│   ├── model.py                                        the main body (model) of the work
│   ├── preprocess                                      data preprocess                
│   │   ├── align.py                                    align different data sources according to the time
│   │   ├── single_process.py                           handle each data source individually
│   │   └── util.py
│   └── utils.py
├── requirements.txt
└── structure.txt
```

## UI
The final visualized page should be like:
<img width="1919" alt="截屏2023-02-07 下午9 28 22" src="https://user-images.githubusercontent.com/49298462/217257747-e53afafe-ea3f-4024-8760-34d0963a863d.png">

## Contact us
🍺 Feel free to leave messages in "Issues"! 

codes/base.py

0 → 100644
+124 −0
原始行号 差异行号 差异行
# Standard library
import copy
import logging
import os
import time

# Third-party
import numpy as np
import torch
from sklearn.metrics import ndcg_score
from torch import nn

# Local
from model import MainModel

class BaseModel(nn.Module):
    """Training/evaluation harness around :class:`MainModel`.

    Runs the optimization loop with early stopping on the training loss,
    periodically evaluates on test data (detection F1/Precision/Recall plus
    HR@k and NDCG@k for root-cause localization), and checkpoints the model
    state with the best HR@1.
    """

    def __init__(self, event_num, metric_num, node_num, device, lr=1e-3, epoches=50, patience=5, result_dir='./', hash_id=None, **kwargs):
        super(BaseModel, self).__init__()

        self.epoches = epoches
        self.lr = lr
        self.patience = patience  # > 0: enable early stopping on the training loss
        self.device = device

        # hash_id may be None; fall back to result_dir itself instead of
        # crashing in os.path.join.
        self.model_save_dir = os.path.join(result_dir, hash_id or "")
        self.model = MainModel(event_num, metric_num, node_num, device, **kwargs)
        self.model.to(device)

    def evaluate(self, test_loader, datatype="Test"):
        """Evaluate on `test_loader` and return a metric dict.

        Returns F1/Rec/Pre for chunk-level anomaly detection and
        HR@{1,3,5} / ndcg@{1,3,5} for root-cause localization.
        A culprit index of -1 marks a normal (non-anomalous) chunk.
        """
        self.model.eval()
        hrs, ndcgs = np.zeros(5), np.zeros(5)
        TP, FP, FN = 0, 0, 0
        batch_cnt, epoch_loss = 0, 0.0

        with torch.no_grad():
            for graph, ground_truths in test_loader:
                res = self.model.forward(graph.to(self.device), ground_truths)
                for idx, faulty_nodes in enumerate(res["y_pred"]):
                    culprit = ground_truths[idx].item()
                    if culprit == -1:  # ground truth: normal chunk
                        if faulty_nodes[0] == -1: TP += 1
                        else: FP += 1
                    else:  # ground truth: anomalous chunk with a known culprit
                        if faulty_nodes[0] == -1: FN += 1
                        else:
                            TP += 1
                            rank = list(faulty_nodes).index(culprit)
                            for j in range(5):
                                hrs[j] += int(rank <= j)
                                ndcgs[j] += ndcg_score([res["y_prob"][idx]], [res["pred_prob"][idx]], k=j+1)
                epoch_loss += res["loss"].item()
                batch_cnt += 1

        pos = TP + FN  # number of truly anomalous chunks
        eval_results = {
                "F1": TP*2.0/(TP+FP+pos) if (TP+FP+pos) > 0 else 0,
                "Rec": TP*1.0/pos if pos > 0 else 0,
                "Pre": TP*1.0/(TP+FP) if (TP+FP) > 0 else 0}

        for j in [1, 3, 5]:
            # guard pos == 0 (a split with no anomalous chunks) against ZeroDivisionError
            eval_results["HR@"+str(j)] = hrs[j-1]*1.0/pos if pos > 0 else 0
            eval_results["ndcg@"+str(j)] = ndcgs[j-1]*1.0/pos if pos > 0 else 0

        logging.info("{} -- {}".format(datatype, ", ".join([k+": "+str(f"{v:.4f}") for k, v in eval_results.items()])))

        return eval_results

    def fit(self, train_loader, test_loader=None, evaluation_epoch=10):
        """Train the model.

        Returns (best eval results, epoch of the best HR@1); both are None
        when no evaluation ever ran (e.g. test_loader is None).
        """
        best_hr1, coverage, best_state, eval_res = -1, None, None, None  # evaluation bookkeeping
        pre_loss, worse_count = float("inf"), 0  # early-stop bookkeeping

        optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)

        for epoch in range(1, self.epoches+1):
            self.model.train()
            batch_cnt, epoch_loss = 0, 0.0
            epoch_time_start = time.time()
            for graph, label in train_loader:
                optimizer.zero_grad()
                loss = self.model.forward(graph.to(self.device), label)['loss']
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                batch_cnt += 1
            epoch_time_elapsed = time.time() - epoch_time_start

            epoch_loss = epoch_loss / batch_cnt
            logging.info("Epoch {}/{}, training loss: {:.5f} [{:.2f}s]".format(epoch, self.epoches, epoch_loss, epoch_time_elapsed))

            ####### early stop on non-improving training loss #######
            if epoch_loss > pre_loss:
                worse_count += 1
                if self.patience > 0 and worse_count >= self.patience:
                    logging.info("Early stop at epoch: {}".format(epoch))
                    break
            else:
                worse_count = 0
            pre_loss = epoch_loss

            ####### evaluate on test data during training #######
            # keeps the original cadence: epochs 9, 19, ... for the default of 10
            if test_loader is not None and (epoch+1) % evaluation_epoch == 0:
                test_results = self.evaluate(test_loader, datatype="Test")
                if test_results["HR@1"] > best_hr1:
                    best_hr1, eval_res, coverage = test_results["HR@1"], test_results, epoch
                    best_state = copy.deepcopy(self.model.state_dict())

                self.save_model(best_state)

        # `coverage` stays None when no evaluation ever ran; the original
        # compared None > 5 and raised TypeError here.
        if coverage is not None and coverage > 5:
            logging.info("* Best result got at epoch {} with HR@1: {:.4f}".format(coverage, best_hr1))
        else:
            logging.info("Unable to converge!")

        return eval_res, coverage

    def load_model(self, model_save_file=""):
        """Load a checkpoint onto the configured device."""
        self.model.load_state_dict(torch.load(model_save_file, map_location=self.device))

    def save_model(self, state, file=None):
        """Persist a state dict (defaults to <model_save_dir>/model.ckpt)."""
        if file is None: file = os.path.join(self.model_save_dir, "model.ckpt")
        try:
            torch.save(state, file, _use_new_zipfile_serialization=False)
        except TypeError:  # older torch without the legacy-serialization kwarg
            torch.save(state, file)

codes/main.py

0 → 100644
+102 −0
原始行号 差异行号 差异行
from torch.utils.data import Dataset, DataLoader
import torch
import dgl
class chunkDataset(Dataset): #[node_num, T, else]
    """Wrap pre-chunked multi-source telemetry as (DGL graph, culprit) samples.

    Each chunk becomes one graph over `node_num` service nodes connected by
    `edges`, with per-node "logs"/"metrics"/"traces" feature tensors attached,
    paired with the chunk's root-cause label.
    """

    def __init__(self, chunks, node_num, edges):
        self.data = []
        self.idx2id = {}  # dataset index -> original chunk id
        for idx, (chunk_id, chunk) in enumerate(chunks.items()):
            self.idx2id[idx] = chunk_id
            sample_graph = dgl.graph(edges, num_nodes=node_num)
            for source in ("logs", "metrics", "traces"):
                sample_graph.ndata[source] = torch.FloatTensor(chunk[source])
            self.data.append((sample_graph, chunk["culprit"]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

    def __get_chunk_id__(self, idx):
        """Map a dataset index back to its original chunk id."""
        return self.idx2id[idx]

from utils import *
from base import BaseModel

import argparse
# Command-line configuration. All values are collected into the `params`
# dict below and forwarded as **kwargs into BaseModel / MainModel.
parser = argparse.ArgumentParser()
parser.add_argument("--random_seed", default=42, type=int)

### Training params
# NOTE: booleans are parsed from strings, so pass e.g. --gpu true / --gpu false
parser.add_argument("--gpu", default=True, type=lambda x: x.lower() == "true")
parser.add_argument("--epoches", default=50, type=int)
parser.add_argument("--batch_size", default=256, type=int)
parser.add_argument("--lr", default=0.001, type=float)
parser.add_argument("--patience", default=10, type=int)

##### Fuse params
parser.add_argument("--self_attn", default=True, type=lambda x: x.lower() == "true")
parser.add_argument("--fuse_dim", default=128, type=int)
# alpha weights detection loss vs. localization loss in MainModel
parser.add_argument("--alpha", default=0.5, type=float)
parser.add_argument("--locate_hiddens", default=[64], type=int, nargs='+')
parser.add_argument("--detect_hiddens", default=[64], type=int, nargs='+')

##### Source params
parser.add_argument("--log_dim", default=16, type=int)
parser.add_argument("--trace_kernel_sizes", default=[2], type=int, nargs='+')
parser.add_argument("--trace_hiddens", default=[64], type=int, nargs='+')
parser.add_argument("--metric_kernel_sizes", default=[2], type=int, nargs='+')
parser.add_argument("--metric_hiddens", default=[64], type=int, nargs='+')
parser.add_argument("--graph_hiddens", default=[64], type=int, nargs='+')
parser.add_argument("--attn_head", default=4, type=int, help="For gat or gat-v2")
parser.add_argument("--activation", default=0.2, type=float, help="use LeakyReLU, shoule be in (0,1)")

##### Data params
parser.add_argument("--data", type=str, required=True)
parser.add_argument("--result_dir", default="../result/")

# Parsed at import time; --data is mandatory, so importing this module
# without CLI args will exit with a usage error.
params = vars(parser.parse_args())

import logging
def get_device(gpu):
    """Return the torch device: CUDA when requested and available, else CPU."""
    if not (gpu and torch.cuda.is_available()):
        logging.info("Using CPU...")
        return torch.device("cpu")
    logging.info("Using GPU...")
    return torch.device("cuda")
    

def collate(data):
    """DataLoader collate_fn: batch the sample graphs and stack the labels."""
    graphs = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]
    return dgl.batch(graphs), torch.tensor(labels)

def run(evaluation_epoch=10):
    """Train and evaluate Eadro on the dataset selected via --data.

    Loads metadata and pre-built chunks from ../chunks/<data>, builds graph
    datasets/loaders, fits the model, and dumps scores under result_dir.
    """
    data_dir = os.path.join("../chunks", params["data"])

    metadata = read_json(os.path.join(data_dir, "metadata.json"))
    event_num, node_num, metric_num = metadata["event_num"], metadata["node_num"], metadata["metric_num"]
    params["chunk_lenth"] = metadata["chunk_lenth"]

    hash_id = dump_params(params)
    params["hash_id"] = hash_id
    seed_everything(params["random_seed"])
    device = get_device(params["gpu"])

    train_chunks, test_chunks = load_chunks(data_dir)

    # BUG FIX: chunkDataset requires the service-dependency edge list; the
    # original call omitted it and always raised TypeError.
    # NOTE(review): assumes the edges are stored in metadata.json under
    # "edges" -- confirm against the preprocessing output.
    edges = metadata["edges"]
    train_data = chunkDataset(train_chunks, node_num, edges)
    test_data = chunkDataset(test_chunks, node_num, edges)

    train_dl = DataLoader(train_data, batch_size=params["batch_size"], shuffle=True, collate_fn=collate, pin_memory=True)
    test_dl = DataLoader(test_data, batch_size=params["batch_size"], shuffle=False, collate_fn=collate, pin_memory=True)

    model = BaseModel(event_num, metric_num, node_num, device, **params)
    scores, converge = model.fit(train_dl, test_dl, evaluation_epoch=evaluation_epoch)

    dump_scores(params["result_dir"], hash_id, scores, converge)
    logging.info("Current hash_id {}".format(hash_id))

# Script entry point: train and evaluate on the dataset given via --data.
if "__main__" == __name__:
    run()

codes/model.py

0 → 100644
+257 −0
原始行号 差异行号 差异行
import torch
from torch import nn
from dgl.nn.pytorch import GATv2Conv
from dgl.nn import GlobalAttentionPooling

class GraphModel(nn.Module):
    def __init__(self, in_dim, graph_hiddens=[64, 128], device='cpu', attn_head=4, activation=0.2, **kwargs):
        super(GraphModel, self).__init__()
        '''
        Params:
            in_dim: the feature dim of each node
        '''
        layers = []

        for i, hidden in enumerate(graph_hiddens):
            in_feats = graph_hiddens[i-1] if i > 0 else in_dim 
            dropout = kwargs["attn_drop"] if "attn_drop" in kwargs else 0
            layers.append(GATv2Conv(in_feats, out_feats=hidden, num_heads=attn_head, 
                                        attn_drop=dropout, negative_slope=activation, allow_zero_in_degree=True)) 
            self.maxpool = nn.MaxPool1d(attn_head)

        self.net = nn.Sequential(*layers).to(device)
        self.out_dim = graph_hiddens[-1]
        self.pooling = GlobalAttentionPooling(nn.Linear(self.out_dim, 1)) 

    
    def forward(self, graph, x):
        '''
        Input:
            x -- tensor float [batch_size*node_num, feature_in_dim] N = {s1, s2, s3, e1, e2, e3}
        '''
        out = None
        for layer in self.net:
            if out is None: out = x
            out = layer(graph, out)
            out = self.maxpool(out.permute(0, 2, 1)).permute(0, 2, 1).squeeze()
        return self.pooling(graph, out) #[bz*node, out_dim] --> [bz, out_dim]

class Chomp1d(nn.Module):
    """Trim the trailing `chomp_size` timesteps added as causal conv padding.

    BUG FIX: with chomp_size == 0 (kernel_size 1 => padding 0) the original
    `x[:, :, :-0]` slice produced an EMPTY tensor; now the input is returned
    unchanged in that case.
    """

    def __init__(self, chomp_size):
        super(Chomp1d, self).__init__()
        self.chomp_size = chomp_size

    def forward(self, x):  # [batch, channels, T + chomp_size]
        if self.chomp_size == 0:
            return x
        return x[:, :, :-self.chomp_size].contiguous()

class ConvNet(nn.Module):
    """Temporal convolution stack (TCN-style causal dilated 1-D convs).

    Each level i applies: Conv1d (dilation dilation**i, left-padded) ->
    BatchNorm1d -> ReLU -> Chomp1d (trims the right padding, keeping the
    convolution causal), optionally followed by Dropout.

    The `dropout` parameter is new and backward-compatible (default 0.0 keeps
    the original behavior); callers in this file already tried to pass it,
    which previously raised TypeError.
    """

    def __init__(self, num_inputs, num_channels, kernel_sizes, dilation=2, dev="cpu", dropout=0.0):
        super(ConvNet, self).__init__()
        layers = []
        for i in range(len(kernel_sizes)):
            dilation_size = dilation ** i
            kernel_size = kernel_sizes[i]
            padding = (kernel_size-1) * dilation_size  # left padding for causality
            in_channels = num_inputs if i == 0 else num_channels[i-1]
            out_channels = num_channels[i]
            layers += [nn.Conv1d(in_channels, out_channels, kernel_size, stride=1, dilation=dilation_size, padding=padding),
                       nn.BatchNorm1d(out_channels), nn.ReLU(), Chomp1d(padding)]
            if dropout > 0:  # optional regularization; off by default
                layers.append(nn.Dropout(dropout))

        self.network = nn.Sequential(*layers)

        self.out_dim = num_channels[-1]
        self.network.to(dev)

    def forward(self, x):  # [batch_size, T, in_dim]
        """Encode a sequence; output keeps the temporal length T."""
        x = x.permute(0, 2, 1).float()  # [batch_size, in_dim, T]
        out = self.network(x)           # [batch_size, out_dim, T]
        out = out.permute(0, 2, 1)      # [batch_size, T, out_dim]
        return out

import math
class SelfAttention(nn.Module):
    """Temporal self-attention that collapses a sequence to one vector.

    Args:
        input_size: int, hidden_size * num_directions
        seq_len: window_size
    """

    def __init__(self, input_size, seq_len):
        super(SelfAttention, self).__init__()
        # one (input_size x 1) projection per timestep position
        self.atten_w = nn.Parameter(torch.randn(seq_len, input_size, 1))
        self.atten_bias = nn.Parameter(torch.randn(seq_len, 1, 1))
        self.glorot(self.atten_w)
        self.atten_bias.data.fill_(0)

    def forward(self, x):
        # x: [batch_size, window_size, input_size]
        scores = torch.bmm(x.transpose(1, 0), self.atten_w) + self.atten_bias  # [w, b, 1]
        weights = scores.transpose(1, 0).tanh()  # [b, w, 1], tanh attention weights
        # weighted sum over the window: [b, 1, w] x [b, w, h] -> [b, h]
        return torch.bmm(weights.transpose(1, 2), x).squeeze()

    def glorot(self, tensor):
        """Glorot/Xavier uniform init, in place."""
        if tensor is None:
            return
        bound = math.sqrt(6.0 / (tensor.size(-2) + tensor.size(-1)))
        tensor.data.uniform_(-bound, bound)

class TraceModel(nn.Module):
    """Encode each node's latency time series [bz, T, 1] into a fixed vector."""

    def __init__(self, device='cpu', trace_hiddens=[20, 50], trace_kernel_sizes=[3, 3], self_attn=False, chunk_lenth=None, **kwargs):
        super(TraceModel, self).__init__()

        self.out_dim = trace_hiddens[-1]
        assert len(trace_hiddens) == len(trace_kernel_sizes)
        # BUG FIX: the original passed dropout=trace_dropout, but
        # `trace_dropout` was never defined (NameError on construction).
        self.net = ConvNet(1, num_channels=trace_hiddens, kernel_sizes=trace_kernel_sizes,
                    dev=device)

        self.self_attn = self_attn
        if self_attn:
            assert (chunk_lenth is not None)  # attention needs the window length
            self.attn_layer = SelfAttention(self.out_dim, chunk_lenth)

    def forward(self, x: torch.Tensor):  # [bz, T, 1]
        hidden_states = self.net(x)  # [bz, T, out_dim]
        if self.self_attn:
            return self.attn_layer(hidden_states)
        return hidden_states[:, -1, :]  # last timestep only: [bz, out_dim]

class MetricModel(nn.Module):
    """Encode each node's metric series [bz, T, metric_num] into a fixed vector."""

    def __init__(self, metric_num, device='cpu', metric_hiddens=[64, 128], metric_kernel_sizes=[3, 3], self_attn=False, chunk_lenth=None, **kwargs):
        super(MetricModel, self).__init__()
        self.metric_num = metric_num
        self.out_dim = metric_hiddens[-1]
        in_dim = metric_num

        assert len(metric_hiddens) == len(metric_kernel_sizes)
        # BUG FIX: the original passed dropout=metric_dropout, but
        # `metric_dropout` was never defined (NameError on construction).
        self.net = ConvNet(num_inputs=in_dim, num_channels=metric_hiddens, kernel_sizes=metric_kernel_sizes,
                            dev=device)

        self.self_attn = self_attn
        if self_attn:
            assert (chunk_lenth is not None)  # attention needs the window length
            self.attn_layer = SelfAttention(self.out_dim, chunk_lenth)

    def forward(self, x):  # [bz, T, metric_num]
        assert x.shape[-1] == self.metric_num
        hidden_states = self.net(x)  # [bz, T, out_dim]
        if self.self_attn:
            return self.attn_layer(hidden_states)
        return hidden_states[:, -1, :]  # last timestep only: [bz, out_dim]

class LogModel(nn.Module):
    """Project per-node log event intensities into a dense embedding."""

    def __init__(self, event_num, out_dim):
        super(LogModel, self).__init__()
        self.embedder = nn.Linear(event_num, out_dim)

    def forward(self, paras: torch.Tensor):  # [bz, event_num]
        """Embed the event-intensity vector (mu, length event_num)."""
        return self.embedder(paras)

class MultiSourceEncoder(nn.Module):
    """Fuse trace/log/metric embeddings per node, then encode the graph.

    Per-node source embeddings are concatenated, linearly fused, gated by a
    GLU (which halves the width), and fed to the GATv2-based GraphModel to
    produce one embedding per (batched) graph.
    """

    def __init__(self, event_num, metric_num, node_num, device, log_dim=64, fuse_dim=64, alpha=0.5, **kwargs):
        super(MultiSourceEncoder, self).__init__()
        self.node_num = node_num
        self.alpha = alpha

        self.trace_model = TraceModel(device=device, **kwargs)
        self.log_model = LogModel(event_num, log_dim)
        self.metric_model = MetricModel(metric_num, device=device, **kwargs)
        fuse_in = self.trace_model.out_dim + log_dim + self.metric_model.out_dim

        # GLU halves the feature dimension, so the fused width must be even
        if fuse_dim % 2:
            fuse_dim += 1
        self.fuse = nn.Linear(fuse_in, fuse_dim)
        self.activate = nn.GLU()
        self.feat_in_dim = fuse_dim // 2

        self.status_model = GraphModel(in_dim=self.feat_in_dim, device=device, **kwargs)
        self.feat_out_dim = self.status_model.out_dim

    def forward(self, graph):
        # per-node embeddings from each telemetry source
        trace_embedding = self.trace_model(graph.ndata["traces"])    # [bz*node_num, trace_dim]
        log_embedding = self.log_model(graph.ndata["logs"])          # [bz*node_num, log_dim]
        metric_embedding = self.metric_model(graph.ndata["metrics"]) # [bz*node_num, metric_dim]

        # concat -> linear fuse -> GLU gate: [bz*node_num, feat_in_dim]
        fused = self.activate(self.fuse(torch.cat((trace_embedding, log_embedding, metric_embedding), dim=-1)))
        return self.status_model(graph, fused)  # [bz, feat_out_dim]

class FullyConnected(nn.Module):
    """Simple MLP: Linear+ReLU per hidden size, then a final Linear head."""

    def __init__(self, in_dim, out_dim, linear_sizes):
        super(FullyConnected, self).__init__()
        dims = [in_dim] + list(linear_sizes)
        layers = []
        for src, dst in zip(dims[:-1], dims[1:]):
            layers.extend([nn.Linear(src, dst), nn.ReLU()])
        layers.append(nn.Linear(dims[-1], out_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor):  # [batch_size, in_dim]
        return self.net(x)

import numpy as np
class MainModel(nn.Module):
    """End-to-end Eadro model: multi-source encoder plus two heads.

    The detection head classifies each chunk as normal/anomalous; the
    localization head ranks the `node_num` services as root-cause candidates.
    The joint loss is alpha * detect_loss + (1 - alpha) * locate_loss.
    """

    def __init__(self, event_num, metric_num, node_num, device, alpha=0.5, debug=False, **kwargs):
        super(MainModel, self).__init__()

        self.device = device
        self.node_num = node_num
        self.alpha = alpha  # weight between detection and localization losses

        self.encoder = MultiSourceEncoder(event_num, metric_num, node_num, device, debug=debug, alpha=alpha, **kwargs)

        self.detecter = FullyConnected(self.encoder.feat_out_dim, 2, kwargs['detect_hiddens']).to(device)
        self.detecter_criterion = nn.CrossEntropyLoss()
        self.localizer = FullyConnected(self.encoder.feat_out_dim, node_num, kwargs['locate_hiddens']).to(device)
        self.localizer_criterion = nn.CrossEntropyLoss(ignore_index=-1)  # -1 == normal chunk, no culprit
        self.get_prob = nn.Softmax(dim=-1)

    def forward(self, graph, fault_indexs):
        """Return loss, ranked predictions, and probability arrays for a batch.

        fault_indexs[i] is the culprit node index of chunk i, or -1 if normal.
        """
        batch_size = graph.batch_size
        embeddings = self.encoder(graph)  # [bz, feat_out_dim]

        # one-hot ground truth over nodes (all-zero row for normal chunks)
        # and the binary anomaly label, built in a single pass
        y_prob = torch.zeros((batch_size, self.node_num)).to(self.device)
        y_anomaly = torch.zeros(batch_size).long().to(self.device)
        for i in range(batch_size):
            if fault_indexs[i] > -1:
                y_prob[i, fault_indexs[i]] = 1
                y_anomaly[i] = 1

        # BUG FIX: the original referenced self.locator / self.detector /
        # self.decoder_criterion, none of which were defined in __init__
        # (AttributeError); use the modules actually constructed above.
        locate_logits = self.localizer(embeddings)
        locate_loss = self.localizer_criterion(locate_logits, fault_indexs.to(self.device))
        detect_logits = self.detecter(embeddings)
        detect_loss = self.detecter_criterion(detect_logits, y_anomaly)
        loss = self.alpha * detect_loss + (1 - self.alpha) * locate_loss

        node_probs = self.get_prob(locate_logits.detach()).cpu().numpy()
        y_pred = self.inference(batch_size, node_probs, detect_logits)

        return {'loss': loss, 'y_pred': y_pred, 'y_prob': y_prob.detach().cpu().numpy(), 'pred_prob': node_probs}

    def inference(self, batch_size, node_probs, detect_logits=None):
        """Rank culprit nodes per chunk; [-1] for chunks predicted normal."""
        # nodes sorted by predicted culprit probability, descending
        node_list = np.flip(node_probs.argsort(axis=1), axis=1)
        # BUG FIX + hoist: this was recomputed inside the loop, and its
        # .squeeze() collapsed batch_size == 1 into a 0-d array that could
        # not be indexed.
        detect_pred = detect_logits.detach().cpu().numpy().argmax(axis=1)

        y_pred = []
        for i in range(batch_size):
            if detect_pred[i] < 1:
                y_pred.append([-1])  # predicted normal: no ranking
            else:
                y_pred.append(node_list[i])

        return y_pred


    

        

        
+217 −0

添加文件。

预览已超出大小限制,变更已折叠。

加载中