提交 d2789463 编辑于 作者: Haowen Xu's avatar Haowen Xu
浏览文件

code is now well-organized

上级
加载中
加载中
加载中
加载中

.coveragerc

0 → 100644
+22 −0
原始行号 差异行号 差异行
[report]
# Regexes for lines to exclude from consideration
exclude_lines =
    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError

    # Don't complain if non-runnable code isn't run:
    if 0:
    if __name__ == .__main__.:


[run]
omit =
    # test code need not coverage statistics
    tests/*

.gitignore

0 → 100644
+8 −0
原始行号 差异行号 差异行
.idea
.cache
*.iml
/debug.py
/.coverage
*.pyc
.DS_Store
*.*~

README.rst

0 → 100644
+134 −0
原始行号 差异行号 差异行
DONUT
=====

Donut is an anomaly detection algorithm for periodic KPIs.

Installation
------------

Check out this repository and execute:

.. code-block:: bash

    pip install git+https://github.com/thu-ml/zhusuan.git
    pip install git+https://github.com/korepwx/tfsnippet.git
    pip install .

This will first install `ZhuSuan`_ and `TFSnippet`_, the two major dependencies
of Donut, then install the Donut package itself.

.. _`ZhuSuan`: https://github.com/thu-ml/zhusuan
.. _`TFSnippet`: https://github.com/korepwx/tfsnippet

API Usage
---------

To prepare the data:

.. code-block:: python

    import numpy as np
    from donut import complete_timestamp, standardize_kpi

    # Read the raw data.
    timestamp, values, labels = ...
    # If there is no label, simply use all zeros.
    labels = np.zeros_like(values, dtype=np.int32)

    # Complete the timestamp, and obtain the missing point indicators.
    timestamp, missing, (values, labels) = \
        complete_timestamp(timestamp, (values, labels))

    # Split the training and testing data.
    test_portion = 0.3
    test_n = int(len(values) * test_portion)
    train_values, test_values = values[:-test_n], values[-test_n:]
    train_labels, test_labels = labels[:-test_n], labels[-test_n:]
    train_missing, test_missing = missing[:-test_n], missing[-test_n:]

    # Standardize the training and testing data.
    train_values, mean, std = standardize_kpi(
        train_values, excludes=np.logical_or(train_labels, train_missing))
    test_values, _, _ = standardize_kpi(test_values, mean=mean, std=std)

To construct a Donut model:

.. code-block:: python

    import tensorflow as tf
    from donut import Donut
    from tensorflow import keras as K
    from tfsnippet.modules import Sequential

    # We build the entire model within the scope of `model_vs`,
    # it should hold exactly all the variables of `model`, including
    # the variables created by Keras layers.
    with tf.variable_scope('model') as model_vs:
        model = Donut(
            h_for_p_x=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            h_for_q_z=Sequential([
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
                K.layers.Dense(100, kernel_regularizer=K.regularizers.l2(0.001),
                               activation=tf.nn.relu),
            ]),
            x_dims=120,
            z_dims=5,
        )

To train the Donut model:

.. code-block:: python

    from donut import DonutTrainer

    trainer = DonutTrainer(model=model, model_vs=model_vs)
    with tf.Session().as_default():
        trainer.fit(train_values, train_labels, train_missing, mean, std)

To use a trained Donut model for prediction:

.. code-block:: python

    from donut import DonutPredictor

    predictor = DonutPredictor(model)
    with tf.Session().as_default():
        # Remember to train the model before using the predictor,
        # or to restore the saved model.
        ...

        # Now we can use the predictor.
        test_score = predictor.get_score(test_values, test_missing)

To save and restore a trained model:

.. code-block:: python

    from tfsnippet.utils import get_variables_as_dict, VariableSaver

    with tf.Session().as_default():
        # Train the model.
        ...

        # Remember to get the model variables after the birth of a
        # `predictor` or a `trainer`.  The :class:`Donut` instances
        # does not build the graph until :meth:`Donut.get_score` or
        # :meth:`Donut.get_training_objective` is called, which is
        # done in the `predictor` or the `trainer`.
        var_dict = get_variables_as_dict(model_vs)

        # save variables to `save_dir`
        saver = VariableSaver(var_dict, save_dir)
        saver.save()

    with tf.Session().as_default():
        # Restore variables from `save_dir`.
        saver = VariableSaver(get_variables_as_dict(model_vs), save_dir)
        saver.restore()

donut/__init__.py

0 → 100644
+11 −0
原始行号 差异行号 差异行
# Package version string, exposed for `pip` and runtime introspection.
__version__ = '0.1'

# Re-export every public name of the sub-modules at package level, so that
# users can write e.g. ``from donut import Donut``.
# NOTE(review): `__all__` below restricts ``from donut import *`` to three
# names, but the attributes pulled in by these wildcard imports remain
# accessible as ``donut.<name>``.
from .augmentation import *
from .model import *
from .prediction import *
from .preprocessing import *
from .reconstruction import *
from .training import *
from .utils import *

# The documented public API of the package.
__all__ = ['Donut', 'DonutPredictor', 'DonutTrainer']

donut/augmentation.py

0 → 100644
+96 −0
原始行号 差异行号 差异行
import numpy as np
from tfsnippet.utils import docstring_inherit

__all__ = ['DataAugmentation', 'MissingDataInjection']


class DataAugmentation(object):
    """
    Base class for data augmentation in training.

    Args:
        mean (float): Mean of the training data.
        std (float): Standard deviation of the training data.
    """

    def __init__(self, mean, std):
        # A non-positive standard deviation cannot describe real data.
        if std <= 0.:
            raise ValueError('`std` must be positive')
        self._mean = mean
        self._std = std

    @property
    def mean(self):
        """Get the mean of the training data."""
        return self._mean

    @property
    def std(self):
        """Get the standard deviation of training data."""
        return self._std

    def augment(self, values, labels, missing):
        """
        Generate augmented data.

        Args:
            values (np.ndarray): 1-D float32 array of shape `(data_length,)`,
                the standardized KPI values.
            labels (np.ndarray): 1-D int32 array of shape `(data_length,)`,
                the anomaly labels for `values`.
            missing (np.ndarray): 1-D int32 array of shape `(data_length,)`,
                the indicator of missing points.

        Returns:
            np.ndarray: The augmented KPI values.
            np.ndarray: The augmented labels.
            np.ndarray: The augmented indicators of missing points.
        """
        # Validate all shapes up-front, then delegate to the concrete
        # implementation supplied by the derived class.
        if len(values.shape) != 1:
            raise ValueError('`values` must be a 1-D array')
        if labels.shape != values.shape:
            raise ValueError('The shape of `labels` does not agree with the '
                             'shape of `values` ({} vs {})'.
                             format(labels.shape, values.shape))
        if missing.shape != values.shape:
            raise ValueError('The shape of `missing` does not agree with the '
                             'shape of `values` ({} vs {})'.
                             format(missing.shape, values.shape))
        return self._augment(values, labels, missing)

    def _augment(self, values, labels, missing):
        """
        Derived classes should override this to actually implement the
        data augmentation algorithm.
        """
        raise NotImplementedError()


class MissingDataInjection(DataAugmentation):
    """
    Data augmentation by injecting missing points into training data.

    Args:
        mean (float): Mean of the training data.
        std (float): Standard deviation of the training data.
        missing_rate (float): The ratio of missing points to inject,
            must be within the range ``[0, 1]``.

    Raises:
        ValueError: If `missing_rate` is not within ``[0, 1]``.
    """

    def __init__(self, mean, std, missing_rate):
        super(MissingDataInjection, self).__init__(mean, std)
        # A Bernoulli probability outside [0, 1] would make
        # `np.random.binomial` fail at augmentation time; fail fast here.
        if not 0. <= missing_rate <= 1.:
            raise ValueError('`missing_rate` must be within [0, 1]')
        self._missing_rate = missing_rate

    @property
    def missing_rate(self):
        """Get the ratio of missing points to inject."""
        return self._missing_rate

    @docstring_inherit(DataAugmentation.augment)
    def _augment(self, values, labels, missing):
        # Draw a Bernoulli mask over the series: 1 marks a point to be
        # turned into a "missing" point.
        inject_y = np.random.binomial(1, self.missing_rate, size=values.shape)
        # BUGFIX: `np.bool` was deprecated in NumPy 1.20 and removed in
        # NumPy >= 1.24; the builtin `bool` is the supported equivalent.
        inject_idx = np.where(inject_y.astype(bool))[0]
        # Missing points are filled with zero on the original scale, which
        # becomes `-mean / std` after standardization.
        values = np.copy(values)
        values[inject_idx] = -self.mean / self.std
        missing = np.copy(missing)
        missing[inject_idx] = 1
        return values, labels, missing