加载中 .gitignore 0 → 100644 +6 −0 原始行号 差异行号 差异行 ./nohup.out /rbac/* __pychche__/* *.out .vscode/* /metric/* No newline at end of file README.md 0 → 100644 +52 −0 原始行号 差异行号 差异行 # MicroCBR MicroCBR: Case-based Reasoning on Spatio-temporal Fault Knowledge Graph for Microservices Troubleshooting 源代码地址:https://github.com/Fengrui-Liu/MicroCBR ## k8s experiment ### demo project: 1. [Online Boutique with OpenTelemetry](https://github.com/julianocosta89/opentelemetry-microservices-demo) 2. [Sock shop with OpenTelemetry](https://github.com/microservices-demo/microservices-demo) 3. [Train ticket with OpenTelemetry](https://github.com/FudanSELab/train-ticket) 4. [Banks](https://github.com/GoogleCloudPlatform/bank-of-anthos) # exporters 1. [blackbox-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter) 2. [mongodb-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-mongodb-exporter) 3. [mysql-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-mysql-exporter) 4. [rabbitmq-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-rabbitmq-exporter) 5. [redis-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-redis-exporter) 6. [jmx-exporter](https://github.com/prometheus/jmx_exporter) 7. [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) 8. [ping_exporter](https://github.com/czerwonk/ping_exporter) ### chaos tool: 1. [Chaos-mesh](https://github.com/chaos-mesh/chaos-mesh) 2. Manual injection ## Fault injection 1. k8s experiment 1. Pod failure, kill, container kill 2. Network disconnection, partition, high delays, high packet loss rate, packet reordering, limit bandwidth 3. Stress of CPU, memory 4. DNS error, wrong 5. Time faults for clock skew 6. JVM cpu-count stress, memory-type stress, trigger garbage collection 7. HTTP patch 2. 
physical nodes # Knowledge base description (KNOWLEDGE_BASE.yaml) * instance_related: whether the kb is peculiar to instance or not * metrics: prometheus metrics, some of them rely on exporters * index: index for queries from [QUERY.yaml](chaos-simulator/dev/METRIC.yaml) * traces: Jaeger traces * logs: k8s logs and application logs * cmd: command line operations chaos-simulator/README.md 0 → 100644 +52 −0 原始行号 差异行号 差异行 ``` ├── README.md ├── chaos_experiment │ ├── chaos_generate.py # Generate chaos experiment yaml file │ ├── chaos_generate_example.ipynb │ └── templates # Templates for chaos experiment │ └── Serial │ ├── container-kill-serial.yaml │ ├── dns-error-serial.yaml │ ├── dns-random-serial.yaml │ ├── http-abort-serial.yaml │ ├── http-patch-body-serial.yaml │ ├── http-patch-head-serial.yaml │ ├── io-attr-serial.yaml │ ├── io-fault-serial.yaml │ ├── io-latency-serial.yaml │ ├── io-mistake-serial.yaml │ ├── jvm-gc-serial.yaml │ ├── jvm-stress-cpu-serial.yaml │ ├── jvm-stress-memory-heap-serial.yaml │ ├── jvm-stress-memory-stack-serial.yaml │ ├── network-bandwidth-serial.yaml │ ├── network-corrupt-serial.yaml │ ├── network-delay-external-target-serial.yaml │ ├── network-delay-serial.yaml │ ├── network-delay-target-serial-both.yaml │ ├── network-delay-target-serial-from.yaml │ ├── network-delay-target-serial-to.yaml │ ├── network-duplicate-serial.yaml │ ├── network-loss-serial.yaml │ ├── network-partition-external-target-serial.yaml │ ├── network-partition-target-serial.yaml │ ├── pod-failure-serial.yaml │ ├── pod-kill-serial.yaml │ ├── stress-cpu-serial.yaml │ ├── stress-memory-serial.yaml │ └── time-serial.yaml ├── config │ ├── CBR-dashboard.json # Can be imported to Grafana │ ├── CHAOS.yaml # Chaos experiment index and management │ ├── CMD.yaml │ ├── KNOWLEDGE_BASE.yaml │ ├── LOG.yaml │ ├── METRIC.yaml │ └── TRACE.yaml └── dev ├── chaos.py ├── client_example.ipynb ├── jaeger.py └── prometheus.py ``` No newline at end of file 
import logging
import os
import subprocess
from typing import Union

import yaml

_LOGGER = logging.getLogger(__name__)


class Chaos_Generate:
    """Generate, write and clean up Chaos Mesh experiment YAML files.

    Typical usage: load a chaos template once with ``load_template``, then
    fan it out to one experiment file per pod with ``generate_by_pods``.
    """

    def __init__(self):
        # Parsed chaos template (dict); populated by load_template().
        self.template = None
        # Experiment base name, taken from the template's metadata.name.
        self.name = "default"
        # Chaos kind (e.g. "NetworkChaos"), taken from the template's kind.
        self.type = None

    def load_template(self, f_path: str) -> Union[dict, None]:
        """Load chaos template

        Args:
            f_path (str): chaos template path

        Returns:
            dict: chaos template, or None when the path does not exist
        """
        if not os.path.exists(f_path):
            _LOGGER.error("Error chaos template path, %s", f_path)
            return None
        # Context manager guarantees the handle is closed even if YAML
        # parsing raises (the original leaked the handle on a parse error).
        with open(f_path, "r", encoding="utf-8") as f:
            self.template = yaml.safe_load(f)
        self.name = self.template["metadata"]["name"]
        self.type = self.template["kind"]
        return self.template

    def generate_by_pods(
        self,
        namespace: str,
        pods: list,
        types: str = "Serial",
        output_dir: str = "./exp/",
    ):
        """Generate chaos experiment by pods

        Writes one YAML file per pod, named
        ``<template-name>-<namespace>-<pod>.yaml`` under
        ``<output_dir>/<types>/``. Pre-existing experiments with the same
        names are removed first via clear_experiments().

        Args:
            namespace (str): microservice namespace.
            pods (list): A list of pods to inject.
            types (str, optional): Serial, once a time. Defaults to "Serial".
            output_dir (str, optional): Data collection path. Defaults to "./exp/".
        """
        if self.template is None:
            _LOGGER.error("Error, no chaos template loaded")
            return
        self.clear_experiments(
            types=types,
            namespace=namespace,
            pods=pods,
            output_dir=output_dir,
        )
        _LOGGER.info("Remove old experiments")
        for pod in pods:
            name_config = self.name + "-" + namespace + "-" + pod
            self.template["metadata"]["name"] = name_config
            self.template["spec"]["selector"]["pods"] = {namespace: [pod]}
            # Chaos kinds that declare a "target" (e.g. network delay or
            # partition toward a peer) must also have the target selector
            # scoped to the experiment namespace.
            if "target" in self.template["spec"]:
                self.template["spec"]["target"]["selector"]["namespaces"] = [
                    namespace
                ]
            f_path = os.path.join(output_dir, types, name_config + ".yaml")
            os.makedirs(os.path.dirname(f_path), exist_ok=True)
            with open(f_path, "w") as f:
                yaml.safe_dump(self.template, f)
        return

    def clear_experiments(
        self,
        namespace: str,
        pods: list,
        types: str = "Serial",
        output_dir: str = "./experiments/",
    ):
        """Clear an existing experiment

        For every pod, ``kubectl delete`` the generated experiment, then
        remove the local YAML file.

        Args:
            namespace (str): microservice namespace
            pods (list): A list of injected pods
            types (str, optional): Serial experiment, once a time. Defaults to "Serial".
            output_dir (str, optional): Collection data dir. Defaults to "./experiments/".
        """
        for pod in pods:
            name_config = self.name + "-" + namespace + "-" + pod
            f_path = os.path.join(output_dir, types, name_config + ".yaml")
            if not os.path.exists(f_path):
                # warning() replaces the deprecated Logger.warn() alias.
                _LOGGER.warning("YAML file %s not exist", f_path)
                continue
            # Argv list with shell=False: pod/namespace names are data, not
            # shell text (the original interpolated them into a shell string,
            # which breaks on metacharacters and invites injection).
            stat = subprocess.run(
                ["kubectl", "delete", "-f", f_path, "-n", namespace],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if stat.returncode == 0:
                _LOGGER.info("Remove experiment %s", f_path)
            elif stat.returncode == 1:
                _LOGGER.info("Experiment %s not exist", f_path)
            else:
                _LOGGER.error(
                    "Return code: {}. {}".format(
                        stat.returncode, stat.stderr.decode("utf-8")
                    )
                )
            # The local file is removed regardless of kubectl's outcome so a
            # later generate_by_pods() starts clean (same as the original).
            os.remove(f_path)
.gitignore 0 → 100644 +6 −0 原始行号 差异行号 差异行 ./nohup.out /rbac/* __pycache__/* *.out .vscode/* /metric/* No newline at end of file
README.md 0 → 100644 +52 −0 原始行号 差异行号 差异行 # MicroCBR MicroCBR: Case-based Reasoning on Spatio-temporal Fault Knowledge Graph for Microservices Troubleshooting 源代码地址:https://github.com/Fengrui-Liu/MicroCBR ## k8s experiment ### demo project: 1. [Online Boutique with OpenTelemetry](https://github.com/julianocosta89/opentelemetry-microservices-demo) 2. [Sock shop with OpenTelemetry](https://github.com/microservices-demo/microservices-demo) 3. [Train ticket with OpenTelemetry](https://github.com/FudanSELab/train-ticket) 4. [Banks](https://github.com/GoogleCloudPlatform/bank-of-anthos) # exporters 1. [blackbox-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-blackbox-exporter) 2. [mongodb-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-mongodb-exporter) 3. [mysql-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-mysql-exporter) 4. [rabbitmq-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-rabbitmq-exporter) 5. [redis-exporter](https://github.com/prometheus-community/helm-charts/tree/main/charts/prometheus-redis-exporter) 6. [jmx-exporter](https://github.com/prometheus/jmx_exporter) 7. [kube-state-metrics](https://github.com/kubernetes/kube-state-metrics) 8. [ping_exporter](https://github.com/czerwonk/ping_exporter) ### chaos tool: 1. [Chaos-mesh](https://github.com/chaos-mesh/chaos-mesh) 2. Manual injection ## Fault injection 1. k8s experiment 1. Pod failure, kill, container kill 2. Network disconnection, partition, high delays, high packet loss rate, packet reordering, limit bandwidth 3. Stress of CPU, memory 4. DNS error, wrong 5. Time faults for clock skew 6. JVM cpu-count stress, memory-type stress, trigger garbage collection 7. HTTP patch 2. 
physical nodes # Knowledge base description (KNOWLEDGE_BASE.yaml) * instance_related: whether the kb is specific to a single instance or not * metrics: prometheus metrics, some of them rely on exporters * index: index for queries from [METRIC.yaml](chaos-simulator/config/METRIC.yaml) * traces: Jaeger traces * logs: k8s logs and application logs * cmd: command line operations
chaos-simulator/README.md 0 → 100644 +52 −0 原始行号 差异行号 差异行 ``` ├── README.md ├── chaos_experiment │ ├── chaos_generate.py # Generate chaos experiment yaml file │ ├── chaos_generate_example.ipynb │ └── templates # Templates for chaos experiment │ └── Serial │ ├── container-kill-serial.yaml │ ├── dns-error-serial.yaml │ ├── dns-random-serial.yaml │ ├── http-abort-serial.yaml │ ├── http-patch-body-serial.yaml │ ├── http-patch-head-serial.yaml │ ├── io-attr-serial.yaml │ ├── io-fault-serial.yaml │ ├── io-latency-serial.yaml │ ├── io-mistake-serial.yaml │ ├── jvm-gc-serial.yaml │ ├── jvm-stress-cpu-serial.yaml │ ├── jvm-stress-memory-heap-serial.yaml │ ├── jvm-stress-memory-stack-serial.yaml │ ├── network-bandwidth-serial.yaml │ ├── network-corrupt-serial.yaml │ ├── network-delay-external-target-serial.yaml │ ├── network-delay-serial.yaml │ ├── network-delay-target-serial-both.yaml │ ├── network-delay-target-serial-from.yaml │ ├── network-delay-target-serial-to.yaml │ ├── network-duplicate-serial.yaml │ ├── network-loss-serial.yaml │ ├── network-partition-external-target-serial.yaml │ ├── network-partition-target-serial.yaml │ ├── pod-failure-serial.yaml │ ├── pod-kill-serial.yaml │ ├── stress-cpu-serial.yaml │ ├── stress-memory-serial.yaml │ └── time-serial.yaml ├── config │ ├── CBR-dashboard.json # Can be imported to Grafana │ ├── CHAOS.yaml # Chaos experiment index and management │ ├── CMD.yaml │ ├── KNOWLEDGE_BASE.yaml │ ├── LOG.yaml │ ├── METRIC.yaml │ └── TRACE.yaml └── dev ├── chaos.py ├── client_example.ipynb ├── jaeger.py └── prometheus.py ``` No newline at end of file
chaos-simulator/chaos_experiment/__pycache__/chaos_generate.cpython-39.pyc 0 → 100644 +3.5 KB 添加文件。此文件类型的文件没有差异预览。 查看文件
import logging
import os
import subprocess
from typing import Union

import yaml

_LOGGER = logging.getLogger(__name__)


class Chaos_Generate:
    """Generate, write and clean up Chaos Mesh experiment YAML files.

    Typical usage: load a chaos template once with ``load_template``, then
    fan it out to one experiment file per pod with ``generate_by_pods``.
    """

    def __init__(self):
        # Parsed chaos template (dict); populated by load_template().
        self.template = None
        # Experiment base name, taken from the template's metadata.name.
        self.name = "default"
        # Chaos kind (e.g. "NetworkChaos"), taken from the template's kind.
        self.type = None

    def load_template(self, f_path: str) -> Union[dict, None]:
        """Load chaos template

        Args:
            f_path (str): chaos template path

        Returns:
            dict: chaos template, or None when the path does not exist
        """
        if not os.path.exists(f_path):
            _LOGGER.error("Error chaos template path, %s", f_path)
            return None
        # Context manager guarantees the handle is closed even if YAML
        # parsing raises (the original leaked the handle on a parse error).
        with open(f_path, "r", encoding="utf-8") as f:
            self.template = yaml.safe_load(f)
        self.name = self.template["metadata"]["name"]
        self.type = self.template["kind"]
        return self.template

    def generate_by_pods(
        self,
        namespace: str,
        pods: list,
        types: str = "Serial",
        output_dir: str = "./exp/",
    ):
        """Generate chaos experiment by pods

        Writes one YAML file per pod, named
        ``<template-name>-<namespace>-<pod>.yaml`` under
        ``<output_dir>/<types>/``. Pre-existing experiments with the same
        names are removed first via clear_experiments().

        Args:
            namespace (str): microservice namespace.
            pods (list): A list of pods to inject.
            types (str, optional): Serial, once a time. Defaults to "Serial".
            output_dir (str, optional): Data collection path. Defaults to "./exp/".
        """
        if self.template is None:
            _LOGGER.error("Error, no chaos template loaded")
            return
        self.clear_experiments(
            types=types,
            namespace=namespace,
            pods=pods,
            output_dir=output_dir,
        )
        _LOGGER.info("Remove old experiments")
        for pod in pods:
            name_config = self.name + "-" + namespace + "-" + pod
            self.template["metadata"]["name"] = name_config
            self.template["spec"]["selector"]["pods"] = {namespace: [pod]}
            # Chaos kinds that declare a "target" (e.g. network delay or
            # partition toward a peer) must also have the target selector
            # scoped to the experiment namespace.
            if "target" in self.template["spec"]:
                self.template["spec"]["target"]["selector"]["namespaces"] = [
                    namespace
                ]
            f_path = os.path.join(output_dir, types, name_config + ".yaml")
            os.makedirs(os.path.dirname(f_path), exist_ok=True)
            with open(f_path, "w") as f:
                yaml.safe_dump(self.template, f)
        return

    def clear_experiments(
        self,
        namespace: str,
        pods: list,
        types: str = "Serial",
        output_dir: str = "./experiments/",
    ):
        """Clear an existing experiment

        For every pod, ``kubectl delete`` the generated experiment, then
        remove the local YAML file.

        Args:
            namespace (str): microservice namespace
            pods (list): A list of injected pods
            types (str, optional): Serial experiment, once a time. Defaults to "Serial".
            output_dir (str, optional): Collection data dir. Defaults to "./experiments/".
        """
        for pod in pods:
            name_config = self.name + "-" + namespace + "-" + pod
            f_path = os.path.join(output_dir, types, name_config + ".yaml")
            if not os.path.exists(f_path):
                # warning() replaces the deprecated Logger.warn() alias.
                _LOGGER.warning("YAML file %s not exist", f_path)
                continue
            # Argv list with shell=False: pod/namespace names are data, not
            # shell text (the original interpolated them into a shell string,
            # which breaks on metacharacters and invites injection).
            stat = subprocess.run(
                ["kubectl", "delete", "-f", f_path, "-n", namespace],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            if stat.returncode == 0:
                _LOGGER.info("Remove experiment %s", f_path)
            elif stat.returncode == 1:
                _LOGGER.info("Experiment %s not exist", f_path)
            else:
                _LOGGER.error(
                    "Return code: {}. {}".format(
                        stat.returncode, stat.stderr.decode("utf-8")
                    )
                )
            # The local file is removed regardless of kubectl's outcome so a
            # later generate_by_pods() starts clean (same as the original).
            os.remove(f_path)