# -*- coding: utf-8 -*-
"""
Base class for deep anomaly detection models.
Some functions are adapted from the PyOD library.
@Author: Hongzuo Xu <hongzuoxu@126.com, xuhongzuo13@nudt.edu.cn>
"""

import sys
import warnings
import pickle
import random
import time
from abc import ABCMeta, abstractmethod
from functools import partial

import numpy as np
import torch
from tqdm import tqdm
from scipy.stats import binom
from ray import tune
from ray.air import session, Checkpoint
from ray.tune.schedulers import ASHAScheduler

from deepod.utils.utility import get_sub_seqs, get_sub_seqs_label

class BaseDeepAD(metaclass=ABCMeta):
    """
    Abstract base class for deep outlier/anomaly detection models.

    Parameters
    ----------
    data_type: str, optional (default='tabular')
        Data type, choice = ['tabular', 'ts']

    network: str, optional (default='MLP')
        Network structure for different data types

    epochs: int, optional (default=100)
        Number of training epochs

    batch_size: int, optional (default=64)
        Number of samples in a mini-batch

    lr: float, optional (default=1e-3)
        Learning rate

    n_ensemble: int or str, optional (default=1)
        Number of ensemble members; if 'auto', the size is chosen
        adaptively from the data size during fitting

    seq_len: int, optional (default=100)
        Size of the window used to create subsequences from the data;
        ignored when handling tabular data (network=='MLP')

    stride: int, optional (default=1)
        Number of time points the window moves between two subsequences;
        ignored when handling tabular data (network=='MLP')

    epoch_steps: int, optional (default=-1)
        Maximum number of steps per epoch;
        if -1, all batches are processed

    prt_steps: int, optional (default=10)
        Number of epochs between two progress printouts

    device: str, optional (default='cuda')
        Torch device, e.g., 'cuda' or 'cpu'

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set,
        i.e. the proportion of outliers in the data set. Used when fitting to
        define the threshold on the decision function.

    verbose: int, optional (default=1)
        Verbosity mode

    random_state: int, optional (default=42)
        Seed used by the random number generators

    Attributes
    ----------
    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : numpy array of shape (n_samples,)
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
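
    Examples
    --------
    A minimal usage sketch with a concrete subclass; ``DeepSVDD`` is used
    purely for illustration and its import path depends on the installed
    DeepOD version::

        import numpy as np
        from deepod.models import DeepSVDD

        X_train = np.random.randn(1000, 16)
        clf = DeepSVDD(epochs=10, device='cpu')
        clf.fit(X_train)
        scores = clf.decision_function(X_train)  # higher = more anomalous
        labels = clf.predict(X_train)            # 0 = inlier, 1 = outlier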
"""

    def __init__(self, model_name, data_type='tabular', network='MLP',
                 epochs=100, batch_size=64, lr=1e-3,
                 n_ensemble=1, seq_len=100, stride=1,
                 epoch_steps=-1, prt_steps=10,
                 device='cuda', contamination=0.1,
                 verbose=1, random_state=42):
        self.model_name = model_name

        self.data_type = data_type
        self.network = network

        # if data_type == 'ts':
        #     assert self.network in sequential_net_name, \
        #         'Assigned network cannot handle time-series data'

        self.seq_len = seq_len
        self.stride = stride

        self.epochs = epochs
        self.batch_size = batch_size
        self.lr = lr

        self.device = device
        self.contamination = contamination

        self.epoch_steps = epoch_steps
        self.prt_steps = prt_steps
        self.verbose = verbose

        self.n_features = -1
        self.n_samples = -1
        self.criterion = None
        self.net = None

        self.n_ensemble = n_ensemble

        self.train_loader = None
        self.test_loader = None

        self.epoch_time = None

        self.train_data = None
        self.train_label = None
        self.val_data = None
        self.val_label = None

        self.decision_scores_ = None
        self.labels_ = None
        self.threshold_ = None

        self.checkpoint_data = {}

        self.random_state = random_state
        self.set_seed(random_state)
        return

    def fit(self, X, y=None):
        """
        Fit detector. y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples, ), optional (default=None)
            Not used in unsupervised methods; present for API consistency
            by convention. Used in (semi-/weakly-) supervised methods.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        if self.data_type == 'ts':
            X_seqs = get_sub_seqs(X, seq_len=self.seq_len, stride=self.stride)
            y_seqs = get_sub_seqs_label(y, seq_len=self.seq_len, stride=self.stride) if y is not None else None
            self.train_data = X_seqs
            self.train_label = y_seqs
            self.n_samples, self.n_features = X_seqs.shape[0], X_seqs.shape[2]
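            # e.g., X of shape (1000, 5) with seq_len=100 and stride=1 yields
            # X_seqs of shape (901, 100, 5): 1000 - 100 + 1 = 901 windows,
            # so n_samples counts windows while n_features stays 5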
        else:
            self.train_data = X
            self.train_label = y
            self.n_samples, self.n_features = X.shape

        if self.verbose >= 1:
            print('Start Training...')

        if self.n_ensemble == 'auto':
            self.n_ensemble = int(np.floor(100 / (np.log(self.n_samples) + self.n_features)) + 1)
            if self.verbose >= 1:
                print(f'ensemble size: {self.n_ensemble}')

        for _ in range(self.n_ensemble):
            self.train_loader, self.net, self.criterion = self.training_prepare(self.train_data,
                                                                                y=self.train_label)
            self._training()

        if self.verbose >= 1:
            print('Start Inference on the training data...')

        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()  # sets threshold_ and labels_

        return self

    def fit_auto_hyper(self, X, y=None, X_test=None, y_test=None,
                       n_ray_samples=5, time_budget_s=None):
        """
        Fit detector with automatic hyper-parameter tuning via Ray Tune.
        y is ignored in unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples, ), optional (default=None)
            Not used in unsupervised methods; present for API consistency
            by convention. Used in (semi-/weakly-) supervised methods.

        X_test : numpy array of shape (n_samples, n_features), default=None
            The input testing samples for hyper-parameter tuning.

        y_test : numpy array of shape (n_samples, ), default=None
            Labels of the input testing samples for hyper-parameter tuning.

        n_ray_samples: int, default=5
            Number of times to sample from the hyper-parameter space

        time_budget_s: int, default=None
            Global time budget in seconds after which all Ray trials are stopped.

        Returns
        -------
        best_config : dict
            Tuned hyper-parameters.
        """
        if self.data_type == 'ts':
            self.train_data = get_sub_seqs(X, self.seq_len, self.stride)
            self.train_label = get_sub_seqs_label(y, self.seq_len, self.stride) if y is not None else None
            self.n_samples, self.n_features = self.train_data.shape[0], self.train_data.shape[2]

        elif self.data_type == 'tabular':
            self.train_data = X
            self.train_label = y
            self.n_samples, self.n_features = self.train_data.shape

        else:
            raise NotImplementedError('unsupported data_type')

        config = self.set_tuned_params()
        metric = "loss" if X_test is None else 'metric'
        mode = "min" if X_test is None else 'max'
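        # without labeled test data, trials minimize the training loss;
        # with labeled test data, trials maximize the reported evaluation metric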
        scheduler = ASHAScheduler(
            metric=metric,
            mode=mode,
            max_t=self.epochs,
            grace_period=1,
            reduction_factor=2,
        )

        size = sys.getsizeof(self.train_data) / (1024 ** 2)
        if size >= 30:
            split = int(len(self.train_data) / (size / 30))
            self.train_data = self.train_data[:split]
            self.train_label = self.train_label[:split] if y is not None else None
            warnings.warn('truncated training data to meet the 95 MiB limit of the Ray ImplicitFunc')

        result = tune.run(
            partial(self._training_ray,
                    X_test=X_test, y_test=y_test),
            resources_per_trial={"cpu": 4, "gpu": 0 if self.device == 'cpu' else 1},
            config=config,
            num_samples=n_ray_samples,
            time_budget_s=time_budget_s,
            scheduler=scheduler,
        )

        best_trial = result.get_best_trial(metric=metric, mode=mode, scope="last")
        print(f"Best trial config: {best_trial.config}")
        print(f"Best trial final validation loss: {best_trial.last_result['loss']}")
        print(f"Best trial final testing metric: {best_trial.last_result['metric']}")

        # tuned results
        best_checkpoint = best_trial.checkpoint.to_air_checkpoint().to_dict()
        best_config = best_trial.config
        self.load_ray_checkpoint(best_config=best_config, best_checkpoint=best_checkpoint)

        best_config['epochs'] = best_checkpoint['epoch']

        # testing on the input training data
        self.decision_scores_ = self.decision_function(X)
        self._process_decision_scores()  # sets threshold_ and labels_

        return best_config

    def decision_function(self, X, return_rep=False):
        """Predict raw anomaly scores of X using the fitted detector.

        The anomaly score of an input sample is computed based on the fitted
        detector. For consistency, outliers are assigned with
        higher anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        return_rep : boolean, optional (default=False)
            Whether to also return the learned representations.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        testing_n_samples = X.shape[0]

        if self.data_type == 'ts':
            X = get_sub_seqs(X, seq_len=self.seq_len, stride=1)

        representations = []
        s_final = np.zeros(testing_n_samples)
        for _ in range(self.n_ensemble):
            self.test_loader = self.inference_prepare(X)

            z, scores = self._inference()
            z, scores = self.decision_function_update(z, scores)
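            # with stride=1 windowing there are only n - seq_len + 1 window
            # scores; left-pad with zeros so the score vector aligns with the
            # original time axis (a window's score lands on its last time point)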
            if self.data_type == 'ts':
                padding = np.zeros(self.seq_len - 1)
                scores = np.hstack((padding, scores))

            s_final += scores
            representations.extend(z)

        representations = np.array(representations)

        if return_rep:
            return s_final, representations
        else:
            return s_final

    def predict(self, X, return_confidence=False):
        """Predict if a particular sample is an outlier or not.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        return_confidence : boolean, optional (default=False)
            If True, also return the confidence of the prediction.

        Returns
        -------
        outlier_labels : numpy array of shape (n_samples,)
            For each observation, tells whether
            it should be considered as an outlier according to the
            fitted model. 0 stands for inliers and 1 for outliers.

        confidence : numpy array of shape (n_samples,)
            Only returned if return_confidence is set to True.
        """
        pred_score = self.decision_function(X)
        prediction = (pred_score > self.threshold_).astype('int').ravel()

        if return_confidence:
            confidence = self._predict_confidence(pred_score)
            return prediction, confidence

        return prediction

    def _predict_confidence(self, test_scores):
        """Predict the model's confidence in making the same prediction
        under slightly different training sets.
        See :cite:`perini2020quantifying`.

        Parameters
        ----------
        test_scores : numpy array of shape (n_samples,)
            The anomaly scores of the input samples.

        Returns
        -------
        confidence : numpy array of shape (n_samples,)
            For each observation, tells how consistently the model would
            make the same prediction if the training set was perturbed.
            Returns a probability in [0, 1].
        """
        n = len(self.decision_scores_)

        # number of training scores that each test score exceeds (or ties)
        count_instances = np.vectorize(lambda x: np.count_nonzero(self.decision_scores_ <= x))
        n_instances = count_instances(test_scores)

        # Derive the outlier probability using a Bayesian approach
        posterior_prob = np.vectorize(lambda x: (1 + x) / (2 + n))(n_instances)
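        # ExCeeD-style confidence: with posterior p_i = (1 + k_i) / (2 + n),
        # where k_i = #{train scores <= s_i}, the confidence below is
        #   c_i = 1 - P[Binom(n, p_i) <= n - floor(n * contamination)]
        # i.e., the probability that a resampled training set of size n would
        # still rank the sample above the contamination cutoff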
        # Transform the outlier probability into a confidence value
        confidence = np.vectorize(
            lambda p: 1 - binom.cdf(n - int(n * self.contamination), n, p)
        )(posterior_prob)
        prediction = (test_scores > self.threshold_).astype('int').ravel()
        np.place(confidence, prediction == 0, 1 - confidence[prediction == 0])
        return confidence

    def _process_decision_scores(self):
        """Internal function to calculate key attributes:

        - threshold_: used to decide the binary label
        - labels_: binary labels of training data

        Returns
        -------
        self
        """
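        # e.g., with contamination=0.1 the threshold is the 90th percentile of
        # the training scores, so ~10% of training samples are labeled outliers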
        self.threshold_ = np.percentile(self.decision_scores_, 100 * (1 - self.contamination))
        self.labels_ = (self.decision_scores_ > self.threshold_).astype('int').ravel()

        self._mu = np.mean(self.decision_scores_)
        self._sigma = np.std(self.decision_scores_)

        return self

    def _training(self):
        optimizer = torch.optim.Adam(self.net.parameters(), lr=self.lr, eps=1e-6)

        self.net.train()
        for i in range(self.epochs):
            t1 = time.time()
            total_loss = 0
            cnt = 0
            for batch_x in self.train_loader:
                loss = self.training_forward(batch_x, self.net, self.criterion)
                self.net.zero_grad()
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                cnt += 1

                # terminate this epoch when reaching the assigned maximum steps per epoch
                if self.epoch_steps != -1 and cnt > self.epoch_steps:
                    break

            t = time.time() - t1
            if self.verbose >= 1 and (i == 0 or (i + 1) % self.prt_steps == 0):
                print(f'epoch{i + 1:3d}, '
                      f'training loss: {total_loss / cnt:.6f}, '
                      f'time: {t:.1f}s')

            if i == 0:
                self.epoch_time = t

            self.epoch_update()

        return

    def _training_ray(self, config, X_test, y_test):
        """Per-trial training entry point for Ray Tune; overridden by subclasses."""
        return
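
    # A subclass's _training_ray typically builds the network from `config`,
    # trains for up to self.epochs, and reports each epoch back to Ray Tune,
    # roughly (a sketch, not part of this base class):
    #   session.report(
    #       {"loss": avg_train_loss, "metric": test_metric},
    #       checkpoint=Checkpoint.from_dict({"epoch": i, "model": self.net.state_dict()}),
    #   )
    # which is what the ASHAScheduler and the best-checkpoint lookup in
    # fit_auto_hyper rely on.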

    def _inference(self):
        self.net.eval()
        with torch.no_grad():
            z_lst = []
            score_lst = []

            if self.verbose >= 2:
                _iter_ = tqdm(self.test_loader, desc='testing: ')
            else:
                _iter_ = self.test_loader

            for batch_x in _iter_:
                batch_z, s = self.inference_forward(batch_x, self.net, self.criterion)
                z_lst.append(batch_z)
                score_lst.append(s)

            z = torch.cat(z_lst).data.cpu().numpy()
            scores = torch.cat(score_lst).data.cpu().numpy()

        return z, scores

    @abstractmethod
    def training_forward(self, batch_x, net, criterion):
        """define forward step in training"""
        pass

    @abstractmethod
    def inference_forward(self, batch_x, net, criterion):
        """define forward step in inference"""
        pass

    @abstractmethod
    def training_prepare(self, X, y):
        """define train_loader, net, and criterion"""
        pass

    @abstractmethod
    def inference_prepare(self, X):
        """define test_loader"""
        pass

    def epoch_update(self):
        """hook for any updating operation after each training epoch"""
        return

    def decision_function_update(self, z, scores):
        """hook for any updating operation after the decision function"""
        return z, scores

    def set_tuned_net(self, config):
        """hook for subclasses to rebuild the network from a tuned config"""
        return

    @staticmethod
    def set_tuned_params():
        """hook for subclasses to define the Ray Tune search space"""
        config = {}
        return config
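
    # A subclass's set_tuned_params would return a Ray Tune search space,
    # for example (illustrative values, not defaults of any model):
    #   config = {
    #       'lr': tune.grid_search([1e-5, 1e-4, 1e-3, 1e-2]),
    #       'epochs': tune.grid_search([20, 50, 100]),
    #   }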

    def load_ray_checkpoint(self, best_config, best_checkpoint):
        """hook for subclasses to restore a model from a Ray Tune checkpoint"""
        return

    def save_model(self, path):
        with open(path, mode="wb") as f:
            pickle.dump(self, f)

    @classmethod
    def load_model(cls, path):
        with open(path, mode="rb") as f:
            return pickle.load(f)
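
    # usage sketch: the whole detector object (including its torch modules) is
    # pickled, so loading requires the same class definitions to be importable:
    #   clf.save_model('detector.pkl')
    #   clf = BaseDeepAD.load_model('detector.pkl')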

    @staticmethod
    def set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        # torch.backends.cudnn.benchmark = False
        # torch.backends.cudnn.deterministic = True